The data consists of vegetation % cover by functional group from across CONUS (from AIM, FIA, LANDFIRE, and RAP), as well as climate variables from DayMet, which have been aggregated into mean interannual conditions accross multiple temporal windows.

Dependencies

User defined parameters

print(params)
## $autoKfold
## [1] FALSE
## 
## $run
## [1] TRUE
## 
## $test_run
## [1] FALSE
## 
## $save_figs
## [1] FALSE
## 
## $trimAnomalies
## [1] FALSE
## 
## $ecoregion
## [1] "shrubGrass"
## 
## $response
## [1] "C3GramCover_prop"
## 
## $removeTexasLouisianaPlain
## [1] FALSE
# set to true if want to run for a limited number of rows (i.e. for code testing)
test_run <- params$test_run
save_figs <- params$save_figs
response <- params$response
fit_sample <- TRUE # fit model to a sample of the data
n_train <- 5e4 # sample size of the training data
n_test <- 1e6 # sample size of the testing data (if this is too big the decile dotplot code throws memory errors)
trimAnom <- params$trimAnomalies
removeTLP <- params$removeTexasLouisianaPlain
run <- params$run
autoKfold <- params$autoKfold
# set option so resampled dataset created here reproduces earlier runs of this code with dplyr 1.0.10
source("../../../Functions/glmTransformsIterates.R")
source("../../../Functions/transformPreds.R")
source("../../../Functions/StepBeta_mine.R")
#source("src/fig_params.R")
#source("src/modeling_functions.R")
 
library(ggspatial)
library(terra)
library(tidyterra)
library(sf)
library(caret)
library(tidyverse)
library(GGally) # for ggpairs()
library(pdp) # for partial dependence plots
library(gridExtra)
library(knitr)
library(patchwork) # for figure insets etc. 
library(ggtext)
library(StepBeta)
theme_set(theme_classic())
library(here)
library(rsample)
library(kableExtra)
library(glmnet)
library(USA.state.boundaries)

read in data

Data compiled in the prepDataForModels.R script

here::i_am("Analysis/VegComposition/ModelFitting/02_ModelFitting.Rmd")
modDat <- readRDS( here("Data_processed", "CoverData", "DataForModels_spatiallyAveraged_withSoils_noSf.rds"))

# For all response variables, make sure there are no 0s add  .0001 from each, since the Gamma model framework can't handle that
modDat[modDat$TotalTreeCover == 0 & !is.na(modDat$TotalTreeCover), "TotalTreeCover"] <- 0.0001
modDat[modDat$TotalHerbaceousCover == 0  & !is.na(modDat$TotalHerbaceousCover), "TotalHerbaceousCover"] <- 0.0001
modDat[modDat$BareGroundCover == 0 & !is.na(modDat$BareGroundCover), "BareGroundCover"] <- 0.0001
modDat[modDat$ShrubCover == 0 & !is.na(modDat$ShrubCover), "ShrubCover"] <- 0.0001

# for proportions, change to percentage scale (rather than proportion)
modDat$AngioTreeCover_prop <- modDat$AngioTreeCover_prop * 100
modDat$ConifTreeCover_prop <- modDat$ConifTreeCover_prop * 100
modDat$C4GramCover_prop <- modDat$C4GramCover_prop * 100
modDat$C3GramCover_prop <- modDat$C3GramCover_prop * 100
modDat$ForbCover_prop <- modDat$ForbCover_prop * 100

#change 0s to .000001 so model will run
modDat[modDat$AngioTreeCover_prop == 0 & !is.na(modDat$AngioTreeCover_prop), "AngioTreeCover_prop"] <- 0.0001
modDat[modDat$ConifTreeCover_prop == 0 & !is.na(modDat$ConifTreeCover_prop), "ConifTreeCover_prop"] <- 0.0001
modDat[modDat$C4GramCover_prop == 0 & !is.na(modDat$C4GramCover_prop), "C4GramCover_prop"] <- 0.0001
modDat[modDat$C3GramCover_prop == 0 & !is.na(modDat$C3GramCover_prop), "C3GramCover_prop"] <- 0.0001
modDat[modDat$ForbCover_prop == 0 & !is.na(modDat$ForbCover_prop), "ForbCover_prop"] <- 0.0001

Prep data

set.seed(1234)
modDat_1 <- modDat %>% 
  dplyr::select(-c(prcp_annTotal:annVPD_min)) %>% 
  # mutate(Lon = st_coordinates(.)[,1], 
  #        Lat = st_coordinates(.)[,2])  %>% 
  # st_drop_geometry() %>% 
  # filter(!is.na(newRegion))
  rename("tmin" = tmin_meanAnnAvg_CLIM, 
     "tmax" = tmax_meanAnnAvg_CLIM, #1
     "tmean" = tmean_meanAnnAvg_CLIM, 
     "prcp" = prcp_meanAnnTotal_CLIM, 
     "t_warm" = T_warmestMonth_meanAnnAvg_CLIM,
     "t_cold" = T_coldestMonth_meanAnnAvg_CLIM, 
     "prcp_wet" = precip_wettestMonth_meanAnnAvg_CLIM,
     "prcp_dry" = precip_driestMonth_meanAnnAvg_CLIM, 
     "prcp_seasonality" = precip_Seasonality_meanAnnAvg_CLIM, #2
     "prcpTempCorr" = PrecipTempCorr_meanAnnAvg_CLIM,  #3
     "abvFreezingMonth" = aboveFreezing_month_meanAnnAvg_CLIM, 
     "isothermality" = isothermality_meanAnnAvg_CLIM, #4
     "annWatDef" = annWaterDeficit_meanAnnAvg_CLIM, 
     "annWetDegDays" = annWetDegDays_meanAnnAvg_CLIM,
     "VPD_mean" = annVPD_mean_meanAnnAvg_CLIM, 
     "VPD_max" = annVPD_max_meanAnnAvg_CLIM, #5
     "VPD_min" = annVPD_min_meanAnnAvg_CLIM, #6
     "VPD_max_95" = annVPD_max_95percentile_CLIM, 
     "annWatDef_95" = annWaterDeficit_95percentile_CLIM, 
     "annWetDegDays_5" = annWetDegDays_5percentile_CLIM, 
     "frostFreeDays_5" = durationFrostFreeDays_5percentile_CLIM, 
     "frostFreeDays" = durationFrostFreeDays_meanAnnAvg_CLIM, 
     "soilDepth" = soilDepth, #7
     "clay" = surfaceClay_perc, 
     "sand" = avgSandPerc_acrossDepth, #8
     "coarse" = avgCoarsePerc_acrossDepth, #9
     "carbon" = avgOrganicCarbonPerc_0_3cm, #10
     "AWHC" = totalAvailableWaterHoldingCapacity,
     ## anomaly variables
     tmean_anom = tmean_meanAnnAvg_3yrAnom, #15
     tmin_anom = tmin_meanAnnAvg_3yrAnom, #16
     tmax_anom = tmax_meanAnnAvg_3yrAnom, #17
    prcp_anom = prcp_meanAnnTotal_3yrAnom, #18
      t_warm_anom = T_warmestMonth_meanAnnAvg_3yrAnom,  #19
     t_cold_anom = T_coldestMonth_meanAnnAvg_3yrAnom, #20
      prcp_wet_anom = precip_wettestMonth_meanAnnAvg_3yrAnom, #21
      precp_dry_anom = precip_driestMonth_meanAnnAvg_3yrAnom,  #22
    prcp_seasonality_anom = precip_Seasonality_meanAnnAvg_3yrAnom, #23 
     prcpTempCorr_anom = PrecipTempCorr_meanAnnAvg_3yrAnom, #24
      aboveFreezingMonth_anom = aboveFreezing_month_meanAnnAvg_3yrAnom, #25  
    isothermality_anom = isothermality_meanAnnAvg_3yrAnom, #26
       annWatDef_anom = annWaterDeficit_meanAnnAvg_3yrAnom, #27
     annWetDegDays_anom = annWetDegDays_meanAnnAvg_3yrAnom,  #28
      VPD_mean_anom = annVPD_mean_meanAnnAvg_3yrAnom, #29
      VPD_min_anom = annVPD_min_meanAnnAvg_3yrAnom,  #30
      VPD_max_anom = annVPD_max_meanAnnAvg_3yrAnom,  #31
     VPD_max_95_anom = annVPD_max_95percentile_3yrAnom, #32
      annWatDef_95_anom = annWaterDeficit_95percentile_3yrAnom, #33 
      annWetDegDays_5_anom = annWetDegDays_5percentile_3yrAnom ,  #34
    frostFreeDays_5_anom = durationFrostFreeDays_5percentile_3yrAnom, #35 
      frostFreeDays_anom = durationFrostFreeDays_meanAnnAvg_3yrAnom #36
  ) %>% 
  dplyr::select(-c(tmin_meanAnnAvg_3yr:durationFrostFreeDays_meanAnnAvg_3yr))

# small dataset for if testing the data
if(test_run) {
  modDat_1 <- slice_sample(modDat_1, n = 1e5)
}

Add a constant to the response variable (+2) so that models run…

modDat_1 <- modDat_1 %>% 
  mutate(response_transformed = .[[response]] + 2) 
names(modDat_1)[names(modDat_1) == "response_transformed"] <- paste0(response, "_transformed")

Identify the ecoregion and response variable type to use in this model run

ecoregion <- params$ecoregion
response <- params$response
print(paste0("In this model run, the ecoregion is ", ecoregion," and the response variable is ",response))
## [1] "In this model run, the ecoregion is shrubGrass and the response variable is C3GramCover_prop"

Visualize the predictor variables

The following are the candidate predictor variables for this ecoregion:

if (ecoregion == "shrubGrass") {
  # select potential predictor variables for the ecoregion of interest
        prednames <-
          c(
"tmean"             , "prcp"                    ,"prcp_seasonality"        ,"prcpTempCorr"          , 
"isothermality"     , "annWatDef"               ,"sand"                    ,"coarse"                , 
"carbon"            , "AWHC"                    ,"tmin_anom"               ,"tmax_anom"             , 
"t_warm_anom"       , "prcp_wet_anom"           ,"precp_dry_anom"          ,"prcp_seasonality_anom" , 
"prcpTempCorr_anom" , "aboveFreezingMonth_anom" ,"isothermality_anom"      ,"annWatDef_anom"        , 
"annWetDegDays_anom", "VPD_mean_anom"           ,"VPD_min_anom"            ,"frostFreeDays_5_anom"   )
  
} else if (ecoregion %in% c("forest", "eastForest", "westForest")) {
  # select potential predictor variables for the ecoregion of interest
  prednames <- 
    c(
"tmean"                 ,"prcp"               , "prcp_dry"                , "prcpTempCorr"      ,     
"isothermality"         ,"annWatDef"          , "clay"                    , "sand"              ,     
"coarse"                ,"carbon"             , "AWHC"                    , "tmin_anom"         ,     
"tmax_anom"             ,"prcp_anom"          , "prcp_wet_anom"           , "precp_dry_anom"    ,     
"prcp_seasonality_anom" ,"prcpTempCorr_anom"  , "aboveFreezingMonth_anom" , "isothermality_anom",     
"annWatDef_anom"        ,"annWetDegDays_anom" , "VPD_mean_anom"           , "VPD_max_95_anom"   ,     
"frostFreeDays_5_anom"   )
} else if (ecoregion == "CONUS") {
  prednames <- c(
    "tmean"               ,"prcp"               ,"prcp_seasonality", "prcpTempCorr"       ,  "isothermality"     ,     
"annWetDegDays"           ,"sand"               ,"coarse"         , "AWHC"                , "tmin_anom"          ,    
"tmax_anom"               ,"prcp_wet_anom"      ,"precp_dry_anom" , "prcp_seasonality_anom", "prcpTempCorr_anom" ,     
"aboveFreezingMonth_anom" ,"isothermality_anom" ,"annWatDef_anom" , "annWetDegDays_anom"  , "VPD_mean_anom"      ,    
"VPD_max_95_anom"         ,"frostFreeDays_5_anom"   
  )
} 

# get the name of the transformed response
response_t <- paste0(response, "_transformed")

Scale the predictor variables for the model-fitting process

allPreds <- modDat_1 %>% 
  dplyr::select(tmin:frostFreeDays,tmean_anom:frostFreeDays_anom, soilDepth:AWHC) %>% 
  names()
modDat_1_s <- modDat_1 %>% 
  mutate(across(all_of(allPreds), base::scale, .names = "{.col}_s")) 
# names(modDat_1_s) <- c(names(modDat_1),
#                        paste0(prednames, "_s")
#                        )
#save model input data after its been scaled
saveRDS(modDat_1_s, file = "./models/scaledModelInputData.rds")

Subset the data to only include data for the ecoregion of interest

if (ecoregion == "shrubGrass") {
  # select data for the ecoregion of interest
  modDat_1_s <- modDat_1_s %>%
    filter(newRegion == "dryShrubGrass")
} else if (ecoregion == "forest") {
  # select data for the ecoregion of interest
  modDat_1_s <- modDat_1_s %>% 
    filter(newRegion %in% c("eastForest", "westForest"))
} else if (ecoregion == "CONUS") {
  modDat_1_s <- modDat_1_s
} else if (ecoregion == "eastForest") {
   modDat_1_s <- modDat_1_s %>% 
    filter(newRegion == "eastForest")
} else if (ecoregion == "westForest") {
    modDat_1_s <- modDat_1_s %>% 
    filter(newRegion == "westForest")
}
# remove the rows that have no observations for the response variable of interest
modDat_1_s <- modDat_1_s[!is.na(modDat_1_s[,response]),]
# subset the data to only include these predictors, and remove any remaining NAs 
modDat_1_s <- modDat_1_s %>% 
  dplyr::select(prednames, paste0(prednames, "_s"), response, response_t, newRegion, Year, Long, Lat, NA_L1NAME, NA_L2NAME) %>% 
  drop_na()

names(prednames) <- prednames
df_pred <- modDat_1_s[, prednames]
# 
# # print the list of predictor variables
# knitr::kable(format = "html", data.frame("Possible_Predictors" = prednames), row.names = FALSE
# ) %>%
#   kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

Visualize the response variable

hist(modDat_1_s[,response], main = paste0("Histogram of ",response),
     xlab = paste0(response))

create_summary <- function(df) {
  df %>% 
    pivot_longer(cols = everything(),
                 names_to = 'variable') %>% 
    group_by(variable) %>% 
    summarise(across(value, .fns = list(mean = ~mean(.x, na.rm = TRUE), min = ~min(.x, na.rm = TRUE), 
                                        median = ~median(.x, na.rm = TRUE), max = ~max(.x, na.rm = TRUE)))) %>% 
    mutate(across(where(is.numeric), round, 4))
}

modDat_1_s[prednames] %>% 
  create_summary() %>% 
  knitr::kable(caption = 'summaries of possible predictor variables') %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 
summaries of possible predictor variables
variable value_mean value_min value_median value_max
AWHC 12.5672 0.8442 12.5912 32.8869
VPD_mean_anom -0.0228 -0.2188 -0.0197 0.1091
VPD_min_anom -0.0103 -0.1116 -0.0144 0.1198
aboveFreezingMonth_anom 0.0965 -1.4133 0.0833 1.9000
annWatDef 115.7174 4.9442 100.1345 525.9683
annWatDef_anom -0.0837 -2.6512 -0.0601 1.0000
annWetDegDays_anom -0.0148 -1.7031 -0.0077 1.0000
carbon 0.9910 0.0383 0.7381 34.6588
coarse 10.6410 0.0000 8.4237 77.8683
frostFreeDays_5_anom -16.1274 -212.0000 -4.0000 27.9000
isothermality 37.5022 25.3563 37.0312 57.3978
isothermality_anom 0.5818 -7.9806 0.4742 8.2485
prcp 347.7057 58.8253 327.3648 1650.2117
prcpTempCorr -0.1356 -0.7677 -0.1228 0.6516
prcpTempCorr_anom 0.0362 -0.6154 0.0473 0.5744
prcp_seasonality 0.9432 0.5289 0.8960 2.2148
prcp_seasonality_anom -0.0162 -0.7093 -0.0097 0.4558
prcp_wet_anom -0.0258 -1.4840 0.0011 0.7128
precp_dry_anom -0.1496 -9.0000 0.0969 1.0000
sand 49.0710 0.3922 48.6289 96.5456
t_warm_anom -0.4269 -3.0721 -0.4580 2.8399
tmax_anom -0.2710 -2.5024 -0.2897 1.7192
tmean 9.5579 1.4215 8.4133 24.7107
tmin_anom -0.5312 -4.2545 -0.5139 2.6098
# response_summary <- modDat_1_s %>% 
#     dplyr::select(#where(is.numeric), -all_of(pred_vars),
#       matches(response)) %>% 
#     create_summary()
# 
# 
# kable(response_summary, 
#       caption = 'summaries of response variables, calculated using paint') %>%
# kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 

Histograms of raw and scaled predictors

scaleFigDat_1 <- modDat_1_s %>% 
  dplyr::select(c(Long, Lat, Year, prednames)) %>% 
  pivot_longer(cols = all_of(names(prednames)), 
               names_to = "predNames", 
               values_to = "predValues_unScaled")
scaleFigDat_2 <- modDat_1_s %>% 
  dplyr::select(c(Long, Lat, Year, paste0(prednames, "_s"))) %>% 
  pivot_longer(cols = all_of(paste0(prednames,"_s"
                                    )), 
               names_to = "predNames", 
               values_to = "predValues_scaled", 
               names_sep = ) %>% 
  mutate(predNames = str_replace(predNames, pattern = "_s$", replacement = ""))

scaleFigDat_3 <- scaleFigDat_1 %>% 
  left_join(scaleFigDat_2)

ggplot(scaleFigDat_3) + 
  facet_wrap(~predNames, scales = "free") +
  geom_histogram(aes(predValues_unScaled), fill = "lightgrey", col = "darkgrey") + 
  geom_histogram(aes(predValues_scaled), fill = "lightblue", col = "blue") +
  xlab ("predictor variable values") + 
  ggtitle("Comparing the distribution of unscaled (grey) to scaled (blue) predictor variables")

modDat_1_s <- modDat_1_s %>% 
  rename_with(~paste0(.x, "_raw"), 
                any_of(names(prednames))) %>% 
  rename_with(~str_remove(.x, "_s$"), 
              any_of(paste0(names(prednames), "_s")))

Plot predictor vars against each other

set.seed(12011993)
# function for colors
my_fn <- function(data, mapping, method="p", use="pairwise", ...){
  
  # grab data
  x <- eval_data_col(data, mapping$x)
  y <- eval_data_col(data, mapping$y)
  
  # calculate correlation
  corr <- cor(x, y, method=method, use=use)
  
  # calculate colour based on correlation value
  # Here I have set a correlation of minus one to blue, 
  # zero to white, and one to red 
  # Change this to suit: possibly extend to add as an argument of `my_fn`
  colFn <- colorRampPalette(c("red", "white", "blue"), interpolate ='spline')
  fill <- colFn(100)[findInterval(corr, seq(-1, 1, length=100))]
  
  ggally_cor(data = data, mapping = mapping, size = 2.5, stars = FALSE, 
             digits = 2, colour = I("black"),...) + 
    theme_void() +
    theme(panel.background = element_rect(fill=fill))
  
}

if (run == TRUE) {
(corrPlot <- modDat_1_s %>% 
  dplyr::select(prednames) %>% 
  slice_sample(n = 5e4) %>% 
  #select(-matches("_")) %>% 
ggpairs( upper = list(continuous = my_fn, size = .1), lower = list(continuous = GGally::wrap("points", alpha = 0.1, size=0.1)), progress = FALSE))
    base::saveRDS(corrPlot, paste0("../ModelFitting/models/", response, "_",ecoregion, "_corrPlot.rds"))
  
  } else {
    # corrPlot <- readRDS(paste0("../ModelFitting/models/", response, "_",ecoregion, "_corrPlot.rds"))
    # (corrPlot)
    print(c("See previous correlation figures"))
  }

Predictor variables compared to binned response variables

set.seed(12011993)
# vector of name of response variables
vars_response <- response
# longformat dataframes for making boxplots
df_sample_plots <-  modDat_1_s  %>% 
  slice_sample(n = 5e4) %>% 
   rename(response = all_of(response_t)) %>% 
  mutate(response = case_when(
    response <= .25 ~ ".25", 
    response > .25 & response <=.5 ~ ".5", 
    response > .5 & response <=.75 ~ ".75", 
    response >= .75  ~ "1", 
  )) %>% 
  dplyr::select(c(response, prednames)) %>% 
  tidyr::pivot_longer(cols = unname(prednames), 
               names_to = "predictor", 
               values_to = "value"
               )  
 

  ggplot(df_sample_plots, aes_string(x= "response", y = 'value')) +
  geom_boxplot() +
  facet_wrap(~predictor , scales = 'free_y') + 
  ylab("Predictor Variable Values") + 
    xlab(response)

Model Fitting

Visualize the spatial blocks and how they differ across environmental space

First, if there are observations in Louisiana, sub-sample them so they’re not so over-represented in the dataset

## make data into spatial format
modDat_1_sf <- modDat_1_s %>% 
  st_as_sf(coords = c("Long", "Lat"), crs = st_crs("PROJCRS[\"unnamed\",\n    BASEGEOGCRS[\"unknown\",\n        DATUM[\"unknown\",\n            ELLIPSOID[\"Spheroid\",6378137,298.257223563,\n                LENGTHUNIT[\"metre\",1,\n                    ID[\"EPSG\",9001]]]],\n        PRIMEM[\"Greenwich\",0,\n            ANGLEUNIT[\"degree\",0.0174532925199433,\n                ID[\"EPSG\",9122]]]],\n    CONVERSION[\"Lambert Conic Conformal (2SP)\",\n        METHOD[\"Lambert Conic Conformal (2SP)\",\n            ID[\"EPSG\",9802]],\n        PARAMETER[\"Latitude of false origin\",42.5,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8821]],\n        PARAMETER[\"Longitude of false origin\",-100,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8822]],\n        PARAMETER[\"Latitude of 1st standard parallel\",25,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8823]],\n        PARAMETER[\"Latitude of 2nd standard parallel\",60,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8824]],\n        PARAMETER[\"Easting at false origin\",0,\n            LENGTHUNIT[\"metre\",1],\n            ID[\"EPSG\",8826]],\n        PARAMETER[\"Northing at false origin\",0,\n            LENGTHUNIT[\"metre\",1],\n            ID[\"EPSG\",8827]]],\n    CS[Cartesian,2],\n        AXIS[\"easting\",east,\n            ORDER[1],\n            LENGTHUNIT[\"metre\",1,\n                ID[\"EPSG\",9001]]],\n        AXIS[\"northing\",north,\n            ORDER[2],\n            LENGTHUNIT[\"metre\",1,\n                ID[\"EPSG\",9001]]]]"))


# download map info for visualization
data(state_boundaries_wgs84) 

cropped_states <- suppressMessages(state_boundaries_wgs84 %>%
  dplyr::filter(NAME!="Hawaii") %>%
  dplyr::filter(NAME!="Alaska") %>%
  dplyr::filter(NAME!="Puerto Rico") %>%
  dplyr::filter(NAME!="American Samoa") %>%
  dplyr::filter(NAME!="Guam") %>%
  dplyr::filter(NAME!="Commonwealth of the Northern Mariana Islands") %>%
  dplyr::filter(NAME!="United States Virgin Islands") %>%
  sf::st_sf() %>%
  sf::st_transform(sf::st_crs(modDat_1_sf))) #%>%
  #sf::st_crop(sf::st_bbox(modDat_1_sf)+c(-1,-1,1,1))
if (ecoregion %in% c("Forest", "eastForest", "forest")){
modDat_1_s$uniqueID <- 1:nrow(modDat_1_s)
modDat_1_sf$uniqueID <- 1:nrow(modDat_1_sf)

# find which observations overlap with Louisiana
obs_LA_temp <- st_intersects(modDat_1_sf, cropped_states[cropped_states$NAME == "Louisiana",], sparse = FALSE)
if (sum(obs_LA_temp) > 0) {
 obs_LA_1 <- modDat_1_sf[which(obs_LA_temp == TRUE, arr.ind = TRUE)[,1],]
# now, find only those within the oversampled area (near the coast)
dims <- c(xmin = 730439.1, xmax = 1042195.5, ymax = -1222745.2, ymin = -1390430.9)
badBox <- st_bbox(dims) %>% 
  st_as_sfc() %>% 
  st_set_crs(value = st_crs(modDat_1_sf))
  
obs_LA_temp2 <- st_intersects(obs_LA_1, badBox, sparse = FALSE)
obs_LA_2 <- obs_LA_1[which(obs_LA_temp2 == TRUE, arr.ind = TRUE)[,1],]

# subsample so there aren't so many 
# get every 7th observation
obs_LA_sampled <- obs_LA_2[seq(from = 1, to = nrow(obs_LA_2), by = 10),]
# remove observations that aren't sampled from the larger dataset
obsToRemove <- obs_LA_2[!(obs_LA_2$uniqueID %in% obs_LA_sampled$uniqueID),]

modDat_1_sf <- modDat_1_sf[!(modDat_1_sf$uniqueID %in% obsToRemove$uniqueID),]
modDat_1_s <- modDat_1_s[!(modDat_1_s$uniqueID %in% obsToRemove$uniqueID),] 
}
# 
# ggplot() +
#   geom_sf(data = cropped_states[cropped_states$NAME == "Louisiana",]) +
#  geom_sf(data = obs_LA_1, col = "black") +
#   geom_sf(data = modDat_1_sf, col = "red", pch = 20) +
#   $geom_hline(aes(yintercept = -590062))
  #ylim(c(-1390431, -1222745)) +
  #xlim(c(730439.1, 1042196))
}
## do a pca of climate across level 2 ecoregions
pca <- prcomp(modDat_1_s[,paste0(prednames)])
library(factoextra)
(fviz_pca_ind(pca, habillage = modDat_1_s$NA_L2NAME, label = "none", addEllipses = TRUE, ellipse.level = .95, ggtheme = theme_minimal(), alpha.ind = .1))

if (ecoregion == "shrubGrass") {
  print("We'll combine the 'Mediterranean California' and 'Western Sierra Madre Piedmont' ecoregions (into 'Mediterranean Piedmont'). We'll also combine `Tamaulipas-Texas semiarid plain,' 'Texas-Lousiana Coastal plain,' and 'South Central semiarid prairies' ecoregions (into (`Semiarid plain and prairies`)." )
  
  modDat_1_s[modDat_1_s$NA_L2NAME %in% c("MEDITERRANEAN CALIFORNIA", "WESTERN SIERRA MADRE PIEDMONT"), "NA_L2NAME"] <- "MEDITERRANEAN PIEDMONT"
  
  modDat_1_s[modDat_1_s$NA_L2NAME %in% c("TAMAULIPAS-TEXAS SEMIARID PLAIN", "TEXAS-LOUISIANA COASTAL PLAIN", "SOUTH CENTRAL SEMIARID PRAIRIES"), "NA_L2NAME"] <- "SEMIARID PLAIN AND PRAIRIES"
 
    modDat_1_s[modDat_1_s$NA_L2NAME %in% c("MARINE WEST COAST FOREST", "WESTERN CORDILLERA"), "NA_L2NAME"] <- "WESTERN CORDILLERA AND WEST COAST FOREST"
  
  modDat_1_s[modDat_1_s$NA_L2NAME %in% c("MEDITERRANEAN CALIFORNIA", "UPPER GILA MOUNTAINS"), "NA_L2NAME"] <- "MEDITERRANEAN CALIFORNIA AND UPPER GILA MOUNTAINS"

modDat_1_s[modDat_1_s$NA_L2NAME %in% c("MIXED WOOD PLAINS", "OZARK/OUACHITA-APPALACHIAN FORESTS"), "NA_L2NAME"] <- "OZARK/OUACHITA-APPALACHIAN FORESTS AND MIXED WOOD PLAINS"
#////
modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("MEDITERRANEAN CALIFORNIA", "WESTERN SIERRA MADRE PIEDMONT"), "NA_L2NAME"] <- "MEDITERRANEAN PIEDMONT"
  
  modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("TAMAULIPAS-TEXAS SEMIARID PLAIN", "TEXAS-LOUISIANA COASTAL PLAIN", "SOUTH CENTRAL SEMIARID PRAIRIES"), "NA_L2NAME"] <- "SEMIARID PLAIN AND PRAIRIES"
 
    modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("MARINE WEST COAST FOREST", "WESTERN CORDILLERA"), "NA_L2NAME"] <- "WESTERN CORDILLERA AND WEST COAST FOREST"
  
  modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("MEDITERRANEAN CALIFORNIA", "UPPER GILA MOUNTAINS"), "NA_L2NAME"] <- "MEDITERRANEAN CALIFORNIA AND UPPER GILA MOUNTAINS"

modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("MIXED WOOD PLAINS", "OZARK/OUACHITA-APPALACHIAN FORESTS"), "NA_L2NAME"] <- "OZARK/OUACHITA-APPALACHIAN FORESTS AND MIXED WOOD PLAINS"
 if (response == "CAMCover") {
   modDat_1_s[modDat_1_s$NA_L2NAME %in% c("MEDITERRANEAN PIEDMONT", "SEMIARID PLAIN AND PRAIRIES"), "NA_L2NAME"] <- "SEMIARID PLAIN AND PRAIRIES AND PIEDMONT"
   modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("MEDITERRANEAN PIEDMONT", "SEMIARID PLAIN AND PRAIRIES"), "NA_L2NAME"] <- "SEMIARID PLAIN AND PRAIRIES AND PIEDMONT"
 } else if (response %in% c("C4GramCover_prop", "C3GramCover_prop")) {
     modDat_1_s[modDat_1_s$NA_L2NAME %in% c("SEMIARID PLAIN AND PRAIRIES", "TEMPERATE PRAIRIES"), "NA_L2NAME"] <- "SEMIARID AND TEMPERATE PRAIRIES"
   modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("SEMIARID PLAIN AND PRAIRIES", "TEMPERATE PRAIRIES"), "NA_L2NAME"] <- "SEMIARID AND TEMPERATE PRAIRIES" 
 }

} else if (ecoregion == "CONUS") {
  
  modDat_1_s[modDat_1_s$NA_L2NAME %in% c("EVERGLADES", "MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS"), "NA_L2NAME"] <- "EVERGLADES MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS"
  
  modDat_1_s[modDat_1_s$NA_L2NAME %in% c("MARINE WEST COAST FOREST", "WESTERN CORDILLERA"), "NA_L2NAME"] <- "WESTERN CORDILLERA AND WEST COAST FOREST"
  
   modDat_1_s[modDat_1_s$NA_L2NAME %in% c("MIXED WOOD PLAINS", "OZARK/OUACHITA-APPALACHIAN FORESTS"), "NA_L2NAME"] <- "OZARK/OUACHITA-APPALACHIAN FORESTS AND MIXED WOOD PLAINS"
  
   modDat_1_s[modDat_1_s$NA_L2NAME %in% c("MEDITERRANEAN CALIFORNIA", "UPPER GILA MOUNTAINS"), "NA_L2NAME"] <- "MEDITERRANEAN CALIFORNIA AND UPPER GILA MOUNTAINS"
  
   modDat_1_s[modDat_1_s$NA_L2NAME %in% c("OZARK/OUACHITA-APPALACHIAN FORESTS AND MIXED WOOD PLAINS", "SOUTHEASTERN USA PLAINS",  "MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS", "EVERGLADES MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS"), "NA_L2NAME"] <- "SOUTHEASTERN AND MIXED WOOD PLAINS"
   
  modDat_1_s[modDat_1_s$NA_L2NAME %in% c("SOUTH CENTRAL SEMIARID PRAIRIES", "TEXAS-LOUISIANA COASTAL PLAIN"), "NA_L2NAME"] <- "SOUTH CENTRAL SEMIARID PRAIRIES"
  #///
  
  modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("EVERGLADES", "MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS"), "NA_L2NAME"] <- "EVERGLADES MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS"
  
  modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("MARINE WEST COAST FOREST", "WESTERN CORDILLERA"), "NA_L2NAME"] <- "WESTERN CORDILLERA AND WEST COAST FOREST"
  
   modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("MIXED WOOD PLAINS", "OZARK/OUACHITA-APPALACHIAN FORESTS"), "NA_L2NAME"] <- "OZARK/OUACHITA-APPALACHIAN FORESTS AND MIXED WOOD PLAINS"
  
   modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("MEDITERRANEAN CALIFORNIA", "UPPER GILA MOUNTAINS"), "NA_L2NAME"] <- "MEDITERRANEAN CALIFORNIA AND UPPER GILA MOUNTAINS"
  
   modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("OZARK/OUACHITA-APPALACHIAN FORESTS AND MIXED WOOD PLAINS", "SOUTHEASTERN USA PLAINS",  "MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS", "EVERGLADES MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS"), "NA_L2NAME"] <- "SOUTHEASTERN AND MIXED WOOD PLAINS"
   
  modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("SOUTH CENTRAL SEMIARID PRAIRIES", "TEXAS-LOUISIANA COASTAL PLAIN"), "NA_L2NAME"] <- "SOUTH CENTRAL SEMIARID PRAIRIES"
  
  if (response %in% c("C4GramCover_prop")) {
    modDat_1_s[modDat_1_s$NA_L2NAME %in% c("CENTRAL USA PLAINS", "TEMPERATE PRAIRIES", "SOUTHEASTERN AND MIXED WOOD PLAINS", "ATLANTIC HIGHLANDS", "MIXED WOOD SHIELD"), "NA_L2NAME"] <- "EASTERN AND MIXED WOOD PLAINS AND FOREST"
  #///
  
  modDat_1_sf[modDat_1_sf$NA_L2NAME %in%c("CENTRAL USA PLAINS", "TEMPERATE PRAIRIES", "SOUTHEASTERN AND MIXED WOOD PLAINS", "ATLANTIC HIGHLANDS", "MIXED WOOD SHIELD"), "NA_L2NAME"] <- "EASTERN AND MIXED WOOD PLAINS AND FOREST"
  }
} else if (ecoregion == "forest"  & response != "CAMCover") {
   modDat_1_s[modDat_1_s$NA_L2NAME %in% c("MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS", "EVERGLADES"), "NA_L2NAME"] <- "MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS    AND EVERGLADES"
   
   modDat_1_s[modDat_1_s$NA_L2NAME %in% c("UPPER GILA MOUNTAINS", "WESTERN CORDILLERA"), "NA_L2NAME"] <- "WESTERN CORDILLERA AND UPPER GILA MOUNTAINS"
   
   modDat_1_s[modDat_1_s$NA_L2NAME %in% c("MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS\tAND EVERGLADES", "SOUTHEASTERN USA PLAINS"), "NA_L2NAME"] <- "SOUTHEASTERN USA PLAINS"
   
   modDat_1_s[modDat_1_s$NA_L2NAME %in% c("ATLANTIC HIGHLANDS", "OZARK/OUACHITA-APPALACHIAN FORESTS"), "NA_L2NAME"] <- "HIGHLANDS AND APPALACHIAN FORESTS"
   
   modDat_1_s[modDat_1_s$NA_L2NAME %in% c("CENTRAL USA PLAINS", "MIXED WOOD PLAINS"), "NA_L2NAME"] <- "CENTRAL AND MIXED WOOD PLAINS"
   
   modDat_1_s[modDat_1_s$NA_L2NAME %in% c("MIXED WOOD SHIELD", "CENTRAL AND MIXED WOOD PLAINS"), "NA_L2NAME"] <- "CENTRAL AND MIXED WOOD PLAINS AND MIXED WOOD SHIELD"
   
       ## divide southeastern US plains into two regions, since it's by far the largest group
  modDat_1_s[modDat_1_s$NA_L2NAME == "WESTERN CORDILLERA AND UPPER GILA MOUNTAINS" &
               modDat_1_s$Long < -966595#-1773969
               , "NA_L2NAME"] <- "WESTERN CORDILLERA AND UPPER GILA MOUNTAINS 1"
  modDat_1_s[modDat_1_s$NA_L2NAME == "WESTERN CORDILLERA AND UPPER GILA MOUNTAINS", "NA_L2NAME"] <- "WESTERN CORDILLERA AND UPPER GILA MOUNTAINS 2"
   #///
   modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS", "EVERGLADES"), "NA_L2NAME"] <- "MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS  AND EVERGLADES"
   
   modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("UPPER GILA MOUNTAINS", "WESTERN CORDILLERA"), "NA_L2NAME"] <- "WESTERN CORDILLERA AND UPPER GILA MOUNTAINS"
   
   modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS\tAND EVERGLADES", "SOUTHEASTERN USA PLAINS"), "NA_L2NAME"] <- "SOUTHEASTERN USA PLAINS"
   
   modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("ATLANTIC HIGHLANDS", "OZARK/OUACHITA-APPALACHIAN FORESTS"), "NA_L2NAME"] <- "HIGHLANDS AND APPALACHIAN FORESTS"
   
   modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("CENTRAL USA PLAINS", "MIXED WOOD PLAINS"), "NA_L2NAME"] <- "CENTRAL AND MIXED WOOD PLAINS"
   
   modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("MIXED WOOD SHIELD", "CENTRAL AND MIXED WOOD PLAINS"), "NA_L2NAME"] <- "CENTRAL AND MIXED WOOD PLAINS AND MIXED WOOD SHIELD"
          ## divide southeastern US plains into two regions, since it's by far the largest group
  modDat_1_sf[modDat_1_sf$NA_L2NAME == "WESTERN CORDILLERA AND UPPER GILA MOUNTAINS" &
              st_coordinates(modDat_1_sf)[,1] < -966595#-1773969
                , ]$NA_L2NAME <- "WESTERN CORDILLERA AND UPPER GILA MOUNTAINS 1"
  modDat_1_sf[modDat_1_sf$NA_L2NAME == "WESTERN CORDILLERA AND UPPER GILA MOUNTAINS", "NA_L2NAME"] <- "WESTERN CORDILLERA AND UPPER GILA MOUNTAINS 2"
  
  if (response %in% c("C3GramCover_prop", "C4GramCover_prop") ) {
     modDat_1_s[modDat_1_s$NA_L2NAME %in% c("MARINE WEST COAST FOREST", "WESTERN CORDILLERA AND UPPER GILA MOUNTAINS 1"), "NA_L2NAME"] <- "WESTERN CORDILLERA AND UPPER GILA MOUNTAINS 1"
     modDat_1_s[modDat_1_s$NA_L2NAME %in% c("CENTRAL AND MIXED WOOD PLAINS AND MIXED WOOD SHIELD", "HIGHLANDS AND APPALACHIAN FORESTS"), "NA_L2NAME"] <- "CENTRAL AND MIXED WOODS AND HIGHLANDS FORESTS"
     #//
  modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("MARINE WEST COAST FOREST", "WESTERN CORDILLERA AND UPPER GILA MOUNTAINS 1"), "NA_L2NAME"] <- "WESTERN CORDILLERA AND UPPER GILA MOUNTAINS 1"
  modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("CENTRAL AND MIXED WOOD PLAINS AND MIXED WOOD SHIELD", "HIGHLANDS AND APPALACHIAN FORESTS"), "NA_L2NAME"] <- "CENTRAL AND MIXED WOODS AND HIGHLANDS FORESTS" 
  
  }
  
} else if (ecoregion == "forest" & response == "CAMCover") {
  
  modDat_1_s[modDat_1_s$NA_L2NAME %in% c("OZARK/OUACHITA-APPALACHIAN FORESTS", "SOUTHEASTERN USA PLAINS"), "NA_L2NAME"] <- "SOUTHEASTERN PLAINS AND APPALACHIAN FORESTS"
    
  modDat_1_s[modDat_1_s$NA_L2NAME %in% c("MARINE WEST COAST FOREST", "WESTERN CORDILLERA"), "NA_L2NAME"] <- "MARINE WEST COAST AND WESTERN CORDILLERA"
    
  modDat_1_s[modDat_1_s$NA_L2NAME %in% c("MIXED WOOD PLAINS", "SOUTHEASTERN PLAINS AND APPALACHIAN FORESTS"), "NA_L2NAME"] <- "SOUTHEASTERN PLAINS AND APPALACHIAN FORESTS"
  
  #///
   modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("OZARK/OUACHITA-APPALACHIAN FORESTS", "SOUTHEASTERN USA PLAINS"), "NA_L2NAME"] <- "SOUTHEASTERN PLAINS AND APPALACHIAN FORESTS"
    
  modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("MARINE WEST COAST FOREST", "WESTERN CORDILLERA"), "NA_L2NAME"] <- "MARINE WEST COAST AND WESTERN CORDILLERA"
    
  modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("MIXED WOOD PLAINS", "SOUTHEASTERN PLAINS AND APPALACHIAN FORESTS"), "NA_L2NAME"] <- "SOUTHEASTERN PLAINS AND APPALACHIAN FORESTS"
  
} else if (ecoregion == "eastForest") {
  modDat_1_s[modDat_1_s$NA_L2NAME %in% c("EVERGLADES", "MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS"), "NA_L2NAME"] <- "EVERGLADES MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS"

  modDat_1_s[modDat_1_s$NA_L2NAME %in% c("MIXED WOOD PLAINS","CENTRAL USA PLAINS"), "NA_L2NAME"] <- "SOUTHEASTERN AND CENTRAL USA PLAINS"

  modDat_1_s[modDat_1_s$NA_L2NAME %in% c("MIXED WOOD SHIELD", "ATLANTIC HIGHLANDS"), "NA_L2NAME"] <- "ATLANTIC HIGHLANDS AND MIXED WOOD SHIELD"

  modDat_1_s[modDat_1_s$NA_L2NAME %in% c("MIXED WOOD PLAINS", "ATLANTIC HIGHLANDS AND MIXED WOOD SHIELD"), "NA_L2NAME"] <- "ATLANTIC HIGHLANDS AND MIXED WOOD SHIELD AND PLAINS"

  modDat_1_s[modDat_1_s$NA_L2NAME %in% c("SOUTHEASTERN AND CENTRAL USA PLAINS", "ATLANTIC HIGHLANDS AND MIXED WOOD SHIELD AND PLAINS"), "NA_L2NAME"] <- "PLAINS AND HIGHLANDS AND SHIELD"

  modDat_1_s[modDat_1_s$NA_L2NAME %in% c("EVERGLADES MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS", "SOUTHEASTERN USA PLAINS"), "NA_L2NAME"] <- "SOUTHEASTERN PLAINS AND COAST"

  # #////
   modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("EVERGLADES", "MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS"), "NA_L2NAME"] <- "EVERGLADES MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS"

   modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("MIXED WOOD PLAINS","CENTRAL USA PLAINS"), "NA_L2NAME"] <- "SOUTHEASTERN AND CENTRAL USA PLAINS"
  
   modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("MIXED WOOD SHIELD", "ATLANTIC HIGHLANDS"), "NA_L2NAME"] <- "ATLANTIC HIGHLANDS AND MIXED WOOD SHIELD"
  
   modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("MIXED WOOD PLAINS", "ATLANTIC HIGHLANDS AND MIXED WOOD SHIELD"), "NA_L2NAME"] <- "ATLANTIC HIGHLANDS AND MIXED WOOD SHIELD AND PLAINS"
  
   modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("SOUTHEASTERN AND CENTRAL USA PLAINS", "ATLANTIC HIGHLANDS AND MIXED WOOD SHIELD AND PLAINS"), "NA_L2NAME"] <- "PLAINS AND HIGHLANDS AND SHIELD"
  
   modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("EVERGLADES MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS", "SOUTHEASTERN USA PLAINS"), "NA_L2NAME"] <- "SOUTHEASTERN PLAINS AND COAST"
  
   ## divide southeastern US plains into two regions, since it's by far the largest group
  # modDat_1_s[modDat_1_s$NA_L2NAME == "SOUTHEASTERN USA PLAINS" &
  #              modDat_1_s$Lat < -590062, "NA_L2NAME"] <- "SOUTHEASTERN USA PLAINS 1"
  # modDat_1_s[modDat_1_s$NA_L2NAME == "SOUTHEASTERN USA PLAINS", #&
  #             # modDat_1_s$Lat < -590062,
  #            "NA_L2NAME"] <- "SOUTHEASTERN USA PLAINS 2"

  #   ## divide southeastern US plains into two regions, since it's by far the largest group
  # modDat_1_s[modDat_1_s$NA_L2NAME == "OZARK/OUACHITA-APPALACHIAN FORESTS" &
  #              modDat_1_s$Long < 854862.2, "NA_L2NAME"] <- "OZARK/OUACHITA-APPALACHIAN FORESTS 1"
  # modDat_1_s[modDat_1_s$NA_L2NAME == "OZARK/OUACHITA-APPALACHIAN FORESTS" &
  #              modDat_1_s$Long  >=   854862.2, "NA_L2NAME"] <- "OZARK/OUACHITA-APPALACHIAN FORESTS 2"
}
## [1] "We'll combine the 'Mediterranean California' and 'Western Sierra Madre Piedmont' ecoregions (into 'Mediterranean Piedmont'). We'll also combine `Tamaulipas-Texas semiarid plain,' 'Texas-Lousiana Coastal plain,' and 'South Central semiarid prairies' ecoregions (into (`Semiarid plain and prairies`)."
# make a table of n for each region
modDat_1_s %>% 
  group_by(NA_L2NAME) %>% 
  dplyr::summarize("Number_Of_Observations" = length(NA_L2NAME)) %>% 
  rename("Level_2_Ecoregion" = NA_L2NAME)%>% 
  kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 
Level_2_Ecoregion Number_Of_Observations
COLD DESERTS 23979
MEDITERRANEAN PIEDMONT 1104
SEMIARID AND TEMPERATE PRAIRIES 741
WARM DESERTS 2813
WEST-CENTRAL SEMIARID PRAIRIES 3335

Then, look at the spatial distribution and environmental characteristics of the grouped ecoregions

map1 <- ggplot() +
  geom_sf(data=cropped_states,fill='white') +
  geom_sf(data=modDat_1_sf#[modDat_1_sf$NA_L2NAME %in% c("MIXED WOOD PLAINS"),]
          ,
          aes(fill=as.factor(NA_L2NAME)),linewidth=0.5,alpha=0.5) +
  geom_point(data=modDat_1_s#[modDat_1_s$NA_L2NAME %in% c("MIXED WOOD PLAINS"),]
             ,
             alpha=0.5, 
             aes(x = Long, y = Lat, color=as.factor(NA_L2NAME)), alpha = .1) +
  #scale_fill_okabeito() +
  #scale_color_okabeito() +
 # theme_default() +
  theme(legend.position = 'none') +
  labs(title = "Level 2 Ecoregions as spatial blocks")

hull <- modDat_1_sf %>%
  ungroup() %>%
  group_by(NA_L2NAME) %>%
  slice(chull(tmean, prcp))

plot1<-ggplot(data=modDat_1_sf,aes(x=tmean,y=prcp)) +
  geom_polygon(data = hull, alpha = 0.25,aes(fill=NA_L2NAME) )+
  geom_point(aes(group=NA_L2NAME,color=NA_L2NAME),alpha=0.25) +
  theme_minimal() + xlab("Annual Average T_mean - long-term average") +
  ylab("Annual Average Precip - long-term average") #+
  #scale_color_okabeito() +
  #scale_fill_okabeito()

plot2<-ggplot(data=modDat_1_sf %>%
                pivot_longer(cols=tmean:prcp),
              aes(x=value,group=name)) +
  # geom_polygon(data = hull, alpha = 0.25,aes(fill=fold) )+
  geom_density(aes(group=NA_L2NAME,fill=NA_L2NAME),alpha=0.25) +
  theme_minimal() +
  facet_wrap(~name,scales='free')# +
  #scale_color_okabeito() +
  #scale_fill_okabeito()
 
library(patchwork)
(combo <- (map1+plot1)/plot2) 

# 
# ggplot(data = modDat_1_s) +
#   geom_density(aes(ShrubCover_transformed, col = NA_L2NAME)) +
#   xlim(c(0,100))

Fit a global model with all of the data

First, fit a LASSO regression model using the glmnet R package

  • This regression is a Gamma glm with a log link
  • Use cross validation across level 2 ecoregions to tune the lambda parameter in the LASSO model
  • this model is fit to using the scaled weather/climate/soils variables
  • this list of possible predictors includes:
    1. main effects
    2. interactions between all soils variables
    3. interactions between climate and weather variables
    4. transformed main effects (squared, log-transformed (add a uniform integer – 20– to all variables prior to log-transformation), square root-transformed (add a uniform integer – 20– to all variables prior to log-transformation))

Get rid of transformed predictions and interactions that are correlated

# get first pass of names correlated variables
X_df <- X %>% 
  as.data.frame() %>% 
  dplyr::select(-'(Intercept)')  
corrNames_i <- X_df %>% 
  cor()  %>% 
   caret::findCorrelation(cutoff = .7, verbose = FALSE, names = TRUE, exact = TRUE)
# remove those names that are untransformed main effects 
  # vector of columns to remove 
badNames <- corrNames_i[!(corrNames_i %in% prednames)]
while (sum(!(corrNames_i %in% prednames))>0) {
 corrNames_i <-  X_df %>% 
    dplyr::select(-badNames) %>% 
     cor()  %>% 
   caret::findCorrelation(cutoff = .7, verbose = FALSE, names = TRUE, exact = TRUE)
 # update the vector of names to remove 
 badNames <- unique(c(badNames, corrNames_i[!(corrNames_i %in% prednames)]))
}

## see if there are any correlated variables left (would be all main effects at this point)
# if there are, step through and remove the variable that each is most correlated with 
if (length(corrNames_i)>1) {
  for (i in 1:length(corrNames_i)) {
    X_i <- X_df %>% 
      dplyr::select(-badNames)
    if (corrNames_i[i] %in% names(X_i)) {
    corMat_i <- cor(x = X_i[corrNames_i[i]], y = X_i %>% dplyr::select(-corrNames_i[i])) 
    badNames_i <- colnames(corMat_i)[abs(corMat_i)>=.7]
    # if there are any predictors in the 'badNames_i', remove them from this list
    if (length(badNames_i) > 0 & sum(c(badNames_i %in% prednames))>0) {
        badNames_i <- badNames_i[!(badNames_i %in% prednames)]
    }
    badNames <- unique(c(badNames, badNames_i))
    }
  }
}
## update the X matrix to exclude these correlated variables
X <- X[,!(colnames(X) %in% badNames)]
if (autoKfold == FALSE) {
  # get the ecoregions for training lambda
  train_eco <- modDat_1_s$NA_L2NAME#[train]
  
  # Fit model -----------------------------------------------
  # specify leave-one-year-out cross-validation
  my_folds <- as.numeric(as.factor(train_eco))

  if (run == TRUE) {
    # set up parallel processing
    library(doMC)
    # this computer has 16 cores (parallel::detectCores())
    registerDoMC(cores = 8)
    
    fit <- cv.glmnet(
    x = X[,2:ncol(X)], 
    y = y, 
    family = stats::Gamma(link = "log"),
    keep = FALSE,
    alpha = 1,  # 0 == ridge regression, 1 == lasso, 0.5 ~~ elastic net
    #lambda = lambdas, 
    nlambda = 50,
    type.measure="mse",
    #penalty.factor = pen_facts,
    foldid = my_folds,
    #thresh = thresh,
    standardize = FALSE, ## scales variables prior to the model sequence... coefficients are always returned on the original scale
    parallel = TRUE#, 
    #relax = ifelse(response == "ShrubCover", yes = TRUE, no = FALSE)
    )
    base::saveRDS(fit, paste0("../ModelFitting/models/", response, "_globalLASSOmod_gammaLogLink_",ecoregion, "_noTLP_",removeTLP,".rds"))
  
  } else {
    fit <- readRDS(paste0("/Users/astears/Documents/Dropbox_static/Work/NAU_USGS_postdoc/PED_vegClimModels/Analysis/VegComposition/ModelFitting/models/", response, "_globalLASSOmod_gammaLogLink_",ecoregion, "_noTLP_",removeTLP,".rds"))
  }
  
} else if (autoKfold == TRUE) {
   if (run == TRUE) {
    # set up parallel processing
    library(doMC)
    # this computer has 16 cores (parallel::detectCores())
    registerDoMC(cores = 8)
    
    fit <- cv.glmnet(
    x = X[,2:ncol(X)], 
    y = y, 
    family = stats::Gamma(link = "log"),
    keep = FALSE,
    alpha = .5,  # 0 == ridge regression, 1 == lasso, 0.5 ~~ elastic net
    lambda = lambdas,
    relax = ifelse(response == "ShrubCover", yes = TRUE, no = FALSE),
    #nlambda = 100,
    type.measure="mse",
    #penalty.factor = pen_facts,
    #foldid = my_folds,
    #thresh = thresh,
    standardize = FALSE, ## scales variables prior to the model sequence... coefficients are always returned on the original scale
    parallel = TRUE
    )
    base::saveRDS(fit, paste0("../ModelFitting/models/", response, "_globalLASSOmod_gammaLogLink_",ecoregion, "_noTLP_",removeTLP,"_kFoldDefault.rds"))
  
  } else {
    fit <- readRDS(paste0("/Users/astears/Documents/Dropbox_static/Work/NAU_USGS_postdoc/PED_vegClimModels/Analysis/VegComposition/ModelFitting/models/", response, "_globalLASSOmod_gammaLogLink_",ecoregion, "_noTLP_",removeTLP,"_kFoldDefault.rds"))
  }
}

  # assess model fit
  # assess.glmnet(fit$fit.preval, #newx = X[,2:293], 
  #               newy = y, family = stats::Gamma(link = "log"))
  # save the minimum lambda
  best_lambda <- fit$lambda.min
  # save the lambda for the most regularized model w/ an MSE that is still 1SE w/in the best lambda model
  lambda_1SE <- fit$lambda.1se
  # save the lambda for the most regularized model w/ an MSE that is still .5SE w/in the best lambda model
  lambda_halfSE <- best_lambda + ((lambda_1SE - best_lambda)/2)
 
  print(fit)     
## 
## Call:  cv.glmnet(x = X[, 2:ncol(X)], y = y, type.measure = "mse", foldid = my_folds,      keep = FALSE, parallel = TRUE, family = stats::Gamma(link = "log"),      alpha = 1, nlambda = 50, standardize = FALSE) 
## 
## Measure: Mean-Squared Error 
## 
##       Lambda Index Measure    SE Nonzero
## min 0.006736    19    1312 156.8      83
## 1se 0.014286    15    1460 190.6      50
  plot(fit)

Now, we need to do stability selection to ensure the coefficients that are being chosen with each lambda are stable

## stability selection for best lambda model 
# setup params
p <- ncol(X[,2:ncol(X)]) # of parameters
n <- length(y) # of observations
n_iter <- 100        # number of subsamples
sample_frac <- 0.75  # fraction of data to subsample
lambda_val <- best_lambda    # fixed lambda value (could be chosen via CV)

# Track selection
selection_counts <- matrix(0, nrow = p, ncol = 1)

for (i in 1:n_iter) {
  # Subsample rows
  sample_idx <- sample(1:n, size = floor(sample_frac * n), replace = FALSE)
  X_sub <- X[sample_idx, ]
  y_sub <- y[sample_idx]

  # Fit Lasso model
  fit_stab_i <- glmnet(x = X_sub[,2:ncol(X_sub)], y = y_sub, 
    family = stats::Gamma(link = "log"),
    alpha = 1, lambda = lambda_val, standardize = FALSE)

  # Get non-zero coefficients (excluding intercept)
  selected <- which(as.vector(coef(fit_stab_i))[-1] != 0)
  selection_counts[selected] <- selection_counts[selected] + 1
}

# Convert counts to selection probabilities (the probability that a variable is selected over 100 iterations)
selection_prob <- selection_counts / n_iter
selection_prob_df <- data.frame(
  VariableNumber = paste0("X", 1:p),
  VariableName = rownames(coef(fit_stab_i))[2:(p+1)],
  SelectionProb = as.vector(selection_prob)
)

# get those variables that are selected in ≥70% of subsets (Meinshausen and Bühlmann, 2010)
bestLambda_coef <- selection_prob_df[selection_prob_df$SelectionProb>=.7, c("VariableName", "SelectionProb")]

#//////
# stability selection for 1se lambda model
lambda_val <-  lambda_1SE    # fixed lambda value (could be chosen via CV)

# Track selection
selection_counts <- matrix(0, nrow = p, ncol = 1)

for (i in 1:n_iter) {
  # Subsample rows
  sample_idx <- sample(1:n, size = floor(sample_frac * n), replace = FALSE)
  X_sub <- X[sample_idx, ]
  y_sub <- y[sample_idx]

  # Fit Lasso model
  fit_stab_i <- glmnet(x = X_sub[,2:ncol(X_sub)], y = y_sub, 
    family = stats::Gamma(link = "log"),
    alpha = 1, lambda = lambda_val, standardize = FALSE)

  # Get non-zero coefficients (excluding intercept)
  selected <- which(as.vector(coef(fit_stab_i))[-1] != 0)
  selection_counts[selected] <- selection_counts[selected] + 1
}

# Convert counts to selection probabilities (the probability that a variable is selected over 100 iterations)
selection_prob <- selection_counts / n_iter
selection_prob_df <- data.frame(
  VariableNumber = paste0("X", 1:p),
  VariableName = rownames(coef(fit_stab_i))[2:(p+1)],
  SelectionProb = as.vector(selection_prob)
)

# get those variables that are selected in ≥70% of subsets (Meinshausen and Bühlmann, 2010)
seLambda_coef <- selection_prob_df[selection_prob_df$SelectionProb>=.7, c("VariableName", "SelectionProb")]

#//////
# stability selection for half se lambda model
lambda_val <- lambda_halfSE    # fixed lambda value (could be chosen via CV)

# Track selection
selection_counts <- matrix(0, nrow = p, ncol = 1)

for (i in 1:n_iter) {
  # Subsample rows
  sample_idx <- sample(1:n, size = floor(sample_frac * n), replace = FALSE)
  X_sub <- X[sample_idx, ]
  y_sub <- y[sample_idx]

  # Fit Lasso model
  fit_stab_i <- glmnet(x = X_sub[,2:ncol(X_sub)], y = y_sub, 
    family = stats::Gamma(link = "log"),
    alpha = 1, lambda = lambda_val, standardize = FALSE)

  # Get non-zero coefficients (excluding intercept)
  selected <- which(as.vector(coef(fit_stab_i))[-1] != 0)
  selection_counts[selected] <- selection_counts[selected] + 1
}

# Convert counts to selection probabilities (the probability that a variable is selected over 100 iterations)
selection_prob <- selection_counts / n_iter
selection_prob_df <- data.frame(
  VariableNumber = paste0("X", 1:p),
  VariableName = rownames(coef(fit_stab_i))[2:(p+1)],
  SelectionProb = as.vector(selection_prob)
)

# get those variables that are selected in ≥70% of subsets (Meinshausen and Bühlmann, 2010)
halfseLambda_coef <- selection_prob_df[selection_prob_df$SelectionProb>=.7, c("VariableName", "SelectionProb")]

If prompted to do so by the input arguments, remove any predictors that are for weather anomalies whose corresponding climate predictor is not included in the model

if (trimAnom == TRUE) {
  # get unique predictors
  bestLamb_all <- bestLambda_coef$VariableName %>% #[str_detect(bestLambda_coef$VariableName, pattern = "_anom_s")] %>% 
    str_remove(pattern = "I\\(") %>% 
    str_remove(pattern = "\\^2\\)") %>% 
    str_split(pattern = ":", simplify = TRUE) 
  if (ncol(bestLamb_all) == 1) {
    bestLamb_all <- unique(bestLamb_all)
  } else {
    bestLamb_all <-  c(bestLamb_all[bestLamb_all[,1] !="",1], bestLamb_all[bestLamb_all[,2] !="",2]) %>% 
      unique()
  }
  # get anomalies
   bestLamb_anom <- bestLamb_all[bestLamb_all %in% prednames_weath]
  # get climate
   bestLamb_clim <- bestLamb_all[bestLamb_all %in% prednames_clim]
  # which anomalies are present in the predictors, but their corresponding climate variable isn't
   bestLamb_anomsWithMissingClim <- bestLamb_anom[!(str_remove(bestLamb_anom, "_anom") %in% bestLamb_clim)]
   # remove anomalies (and all interaction terms w/ those anomalies) from the predictor list

   if (length(bestLamb_anomsWithMissingClim) != 0) {
   
        bestLambda_coef_NEW <- bestLambda_coef
      for (i in 1:length(bestLamb_anomsWithMissingClim)) {
     bestLambda_coef_NEW <- bestLambda_coef_NEW[!str_detect(bestLambda_coef_NEW$VariableName, pattern = bestLamb_anomsWithMissingClim[i]),]
      }
     
   bestLambda_coef <- bestLambda_coef_NEW
   }
   
   
  #//// 1 se lambda model
      if (nrow(seLambda_coef) !=0) {
        # get unique predictors
  seLamb_all <- seLambda_coef$VariableName %>% #[str_detect(seLambda_coef$VariableName, pattern = "_anom_s")] %>% 
    str_remove(pattern = "I\\(") %>% 
    str_remove(pattern = "_s\\^2\\)") %>% 
    str_split(pattern = ":", simplify = TRUE) 
  if (ncol(seLamb_all) == 1) {
    seLamb_all <- unique(seLamb_all)
  } else {
    seLamb_all <-  c(seLamb_all[seLamb_all[,1] !="",1], seLamb_all[seLamb_all[,2] !="",2]) %>% 
      unique()
  }
  # get anomalies
   seLamb_anom <- seLamb_all[seLamb_all %in% prednames_weath]
  # get climate
   seLamb_clim <- seLamb_all[seLamb_all %in% prednames_clim]
  # which anomalies are present in the predictors, but their corresponding climate variable isn't
   seLamb_anomsWithMissingClim <- seLamb_anom[!(str_remove(seLamb_anom, "_anom") %in%  seLamb_clim)]
   # remove anomalies (and all interaction terms w/ those anomalies) from the predictor list
      if (length(seLamb_anomsWithMissingClim) != 0) {
    seLambda_coef_NEW <- seLambda_coef
   for (i in 1:length(seLamb_anomsWithMissingClim)) {
     seLambda_coef_NEW <- seLambda_coef_NEW[!str_detect(seLambda_coef_NEW$VariableName, pattern = seLamb_anomsWithMissingClim[i]),]
   }
   seLambda_coef <- seLambda_coef_NEW
      }
   }
  
  #//// 1 se lambda model
      if (nrow(halfseLambda_coef) !=0) {
        # get unique predictors
  halfseLamball<- halfseLambda_coef$VariableName %>% #[str_detect(halfseLambda_coef$VariableName, pattern = "_anom_s")] %>% 
    str_remove(pattern = "I\\(") %>% 
    str_remove(pattern = "_s\\^2\\)") %>% 
    str_split(pattern = ":", simplify = TRUE) 
  
    if (ncol(halfseLamball) == 1) {
    halfseLamball <- unique(halfseLamball)
  } else {
    halfseLamball <-  c(halfseLamball[halfseLamball[,1] !="",1], halfseLamball[halfseLamball[,2] !="",2]) %>% 
      unique()
  }
  
  # get anomalies
   halfseLambanom <- halfseLamball[halfseLamball %in% prednames_weath]
  # get climate
   halfseLambclim <- halfseLamball[halfseLamball %in% prednames_clim]
  # which anomalies are present in the predictors, but their corresponding climate variable isn't
   halfseLambanomsWithMissingClim <- halfseLambanom[!(str_remove(halfseLambanom, "_anom_s") %in%  halfseLambclim)]
   # remove anomalies (and all interaction terms w/ those anomalies) from the predictor list

   
   ##
        if (length(halfseLambanomsWithMissingClim) != 0) {
   halfseLambda_coef_NEW <- halfseLambda_coef
   for (i in 1:length(halfseLambanomsWithMissingClim)) {
     halfseLambda_coef_NEW <- halfseLambda_coef_NEW[!str_detect(halfseLambda_coef_NEW$VariableName, pattern = halfseLambanomsWithMissingClim[i]),]
   }
   halfseLambda_coef <- halfseLambda_coef_NEW
      }
   }
    ##
  
      }

Then, fit regular glm models (Gamma glm with a log link), first using the coefficients from the ‘best’ lambda identified in the LASSO model, and then using the coefficients from the ‘1SE’ lambda and then the ‘1/2SE’ lambda values identified from the LASSO (this is the value of lambda such that the cross validation error is within 1 standard error of the minimum).

## fit w/ the identified coefficients from the 'best' lambda, but using the glm function
  mat_glmnet_best <- bestLambda_coef$VariableName 

  if (length(mat_glmnet_best) == 0) {
    f_glm_bestLambda <- as.formula(paste0(response_t, "~ 1"))
  } else {
  f_glm_bestLambda <- as.formula(paste0(response_t, " ~ ", paste0(mat_glmnet_best, collapse = " + ")))
  }
  
  fit_glm_bestLambda <- glm(data = modDat_1_s
                              , formula =  f_glm_bestLambda, family =  stats::Gamma(link = "log"))
  
   ## fit w/ the identified coefficients from the '1se' lambda, but using the glm function
  mat_glmnet_1se <- seLambda_coef$VariableName
    
  if (length(mat_glmnet_1se) == 0) {
    f_glm_1se <- as.formula(paste0(response_t, "~ 1"))
  } else {
  f_glm_1se <- as.formula(paste0(response_t, " ~ ", paste0(mat_glmnet_1se, collapse = " + ")))
  }


  fit_glm_se <- glm(data = modDat_1_s, formula = f_glm_1se,
                    family =  stats::Gamma(link = "log"))
  
     ## fit w/ the identified coefficients from the '.5se' lambda, but using the glm function
  mat_glmnet_halfse <- halfseLambda_coef$VariableName
  
  if (length(mat_glmnet_halfse) == 0) {
    f_glm_halfse <- as.formula(paste0(response_t, "~ 1"))
  } else {
  f_glm_halfse <- as.formula(paste0(response_t, " ~ ", paste0(mat_glmnet_halfse, collapse = " + ")))
  }

  fit_glm_halfse <- glm(data = modDat_1_s, formula =  f_glm_halfse,
                    family =  stats::Gamma(link = "log"))
  
  ## save models 
  if (trimAnom == TRUE) {
    saveRDS(fit_glm_bestLambda, paste0("/Users/astears/Documents/Dropbox_static/Work/NAU_USGS_postdoc/PED_vegClimModels/Analysis/VegComposition/ModelFitting/models/",response,"_",ecoregion, "_noTLP_",removeTLP,"_trimAnom_bestLambdaGLM.rds"))
  saveRDS(fit_glm_halfse, paste0("/Users/astears/Documents/Dropbox_static/Work/NAU_USGS_postdoc/PED_vegClimModels/Analysis/VegComposition/ModelFitting/models/",response,"_",ecoregion, "_noTLP_",removeTLP,"_trimAnom_halfSELambdaGLM.rds"))
  saveRDS(fit_glm_se, paste0("/Users/astears/Documents/Dropbox_static/Work/NAU_USGS_postdoc/PED_vegClimModels/Analysis/VegComposition/ModelFitting/models/",response,"_",ecoregion, "_noTLP_",removeTLP,"_trimAnom_oneSELambdaGLM.rds"))
  } else {
    saveRDS(fit_glm_bestLambda, paste0("/Users/astears/Documents/Dropbox_static/Work/NAU_USGS_postdoc/PED_vegClimModels/Analysis/VegComposition/ModelFitting/models/",response,"_",ecoregion, "_noTLP_",removeTLP,"_bestLambdaGLM.rds"))
  saveRDS(fit_glm_halfse, paste0("/Users/astears/Documents/Dropbox_static/Work/NAU_USGS_postdoc/PED_vegClimModels/Analysis/VegComposition/ModelFitting/models/",response,"_",ecoregion, "_noTLP_",removeTLP,"_halfSELambdaGLM.rds"))
  saveRDS(fit_glm_se, paste0("/Users/astears/Documents/Dropbox_static/Work/NAU_USGS_postdoc/PED_vegClimModels/Analysis/VegComposition/ModelFitting/models/",response,"_",ecoregion, "_noTLP_",removeTLP,"_oneSELambdaGLM.rds"))
    
  }

Then, we predict (on the training set) using both of these models (best lambda and 1se lambda)

  ## predict on the test data
  # lasso model predictions with the optimal lambda
  optimal_pred <- predict(fit_glm_bestLambda, newx=X[,2:ncol(X)], type = "response") - 2
  optimal_pred_1se <-  predict(fit_glm_se, newx=X[,2:ncol(X)], type = "response") - 2
  optimal_pred_halfse <- predict(fit_glm_halfse, newx = X[,2:ncol(X)], type = "response") - 2
  
    null_fit <- glm(#data = data.frame("y" = y, X[,paste0(prednames, "_s")]), 
      formula = y ~ 1, family = stats::Gamma(link = "log"))
  null_pred <- predict(null_fit, newdata = as.data.frame(X), type = "response"
                       ) - 2

  ## snap values above 100 and below 0 to be 100 or z
  optimal_pred[optimal_pred>100] <- 100
  optimal_pred[optimal_pred<0] <- 0
  optimal_pred_1se[optimal_pred_1se>100] <- 100
  optimal_pred_1se[optimal_pred_1se<0] <- 0
  optimal_pred_halfse[optimal_pred_halfse>100] <- 100
  optimal_pred_halfse[optimal_pred_halfse<0] <- 0
  
  # save data
  fullModOut <- list(
    "modelObject" = fit,
    "nullModelObject" = null_fit,
    "modelPredictions" = data.frame(#ecoRegion_holdout = rep(test_eco,length(y)),
      obs=y-2,
                    pred_opt=optimal_pred, 
                    pred_opt_se = optimal_pred_1se,
                    pred_opt_halfse = optimal_pred_halfse,
                    pred_null=null_pred#,
                    #pred_nopenalty=nopen_pred
                    ))
  
# # calculate correlations between null and optimal model 
# my_cors <- c(cor(optimal_pred, c(y)),
#              cor(optimal_pred_1se, c(y)), 
#             cor(null_pred, c(y))
#             )
# 
# # calculate mse between null and optimal model 
# my_mse <- c(mean((fullModOut$modelPredictions$pred_opt -  c(y))^2) ,
#             mean((fullModOut$modelPredictions$pred_opt_se -  c(y))^2) ,
#             mean((fullModOut$modelPredictions$pred_null - c(y))^2)#,
#             #mean((obs_pred$pred_nopenalty - obs_pred$obs)^2)
#             )

ggplot() + 
  geom_point(aes(X[,2], fullModOut$modelPredictions$obs), col = "black", alpha = .1) + 
  geom_point(aes(X[,2], fullModOut$modelPredictions$pred_opt), col = "red", alpha = .1) + ## predictions w/ the CV model
  geom_point(aes(X[,2], fullModOut$modelPredictions$pred_opt_halfse), col = "orange", alpha = .1) + ## predictions w/ the CV model (.5se lambda)
  geom_point(aes(X[,2], fullModOut$modelPredictions$pred_opt_se), col = "green", alpha = .1) + ## predictions w/ the CV model (1se lambda)
  geom_point(aes(X[,2], fullModOut$modelPredictions$pred_null), col = "blue", alpha = .1) + 
  labs(title = "A rough comparison of observed and model-predicted values", 
       subtitle = "black = observed values \n red = predictions from 'best lambda' model \n orange = predictions for '1/2se' lambda model \n green = predictions from '1se' lambda model \n blue = predictions from null model") +
  xlab(colnames(X)[2])

  #ylim(c(0,200))

The internal cross-validation process to fit the global LASSO model identified an optimal lambda value (regularization parameter) of r{print(best_lambda)}. The lambda value such that the cross validation error is within 1 standard error of the minimum (“1se lambda”) was `r{print(fit$lambda.1se)}`` . The following coefficients were kept in each model:

# the coefficient matrix from the 'best model' -- find and print those coefficients that aren't 0 in a table
coef_glm_bestLambda <- coef(fit_glm_bestLambda) %>% 
  data.frame() 
coef_glm_bestLambda$coefficientName <- rownames(coef_glm_bestLambda)
names(coef_glm_bestLambda)[1] <- "coefficientValue_bestLambda"
# coefficient matrix from the '1se' model 
coef_glm_1se <- coef(fit_glm_se) %>% 
  data.frame() 
coef_glm_1se$coefficientName <- rownames(coef_glm_1se)
names(coef_glm_1se)[1] <- "coefficientValue_1seLambda"
# coefficient matrix from the 'half se' model 
coef_glm_halfse <- coef(fit_glm_halfse) %>% 
  data.frame() 
coef_glm_halfse$coefficientName <- rownames(coef_glm_halfse)
names(coef_glm_halfse)[1] <- "coefficientValue_halfseLambda"
# add together
coefs <- full_join(coef_glm_bestLambda, coef_glm_halfse) %>% 
  full_join(coef_glm_1se) %>% 
  dplyr::select(coefficientName, coefficientValue_bestLambda,
                coefficientValue_halfseLambda, coefficientValue_1seLambda)

globModTerms <- coefs[!is.na(coefs$coefficientValue_bestLambda), "coefficientName"]

## also, get the number of unique variables in each model 
var_prop_pred <- paste0(response, "_pred")
response_vars <- c(response, var_prop_pred)
# for best lambda model
prednames_fig <- paste(str_split(globModTerms, ":", simplify = TRUE)) 
prednames_fig <- str_replace(prednames_fig, "I\\(", "")
prednames_fig <- str_replace(prednames_fig, "\\^2\\)", "")
prednames_fig <- unique(prednames_fig[prednames_fig>0])
prednames_fig <- prednames_fig
prednames_fig_num <- length(prednames_fig)
# for 1SE lambda model
globModTerms_1se <- coefs[!is.na(coefs$coefficientValue_1seLambda), "coefficientName"]
if (length(globModTerms_1se) == 1) {
prednames_fig_1se <- paste(str_split(globModTerms_1se, ":", simplify = TRUE)) 
prednames_fig_1se <- str_replace(prednames_fig_1se, "I\\(", "")
prednames_fig_1se <- str_replace(prednames_fig_1se, "\\^2\\)", "")
prednames_fig_1se <- unique(prednames_fig_1se[prednames_fig_1se>0])
prednames_fig_1se_num <- c(0)
} else {
prednames_fig_1se <- paste(str_split(globModTerms_1se, ":", simplify = TRUE)) 
prednames_fig_1se <- str_replace(prednames_fig_1se, "I\\(", "")
prednames_fig_1se <- str_replace(prednames_fig_1se, "\\^2\\)", "")
prednames_fig_1se <- unique(prednames_fig_1se[prednames_fig_1se>0])
prednames_fig_1se_num <- length(prednames_fig_1se)
}
# for 1/2SE lambda model
globModTerms_halfse <- coefs[!is.na(coefs$coefficientValue_halfseLambda), "coefficientName"]
if (length(globModTerms_halfse) == 1) {
prednames_fig_halfse <- paste(str_split(globModTerms_halfse, ":", simplify = TRUE)) 
prednames_fig_halfse <- str_replace(prednames_fig_halfse, "I\\(", "")
prednames_fig_halfse <- str_replace(prednames_fig_halfse, "\\^2\\)", "")
prednames_fig_halfse <- unique(prednames_fig_halfse[prednames_fig_halfse>0])
prednames_fig_halfse_num <- c(0)
} else {
prednames_fig_halfse <- paste(str_split(globModTerms_halfse, ":", simplify = TRUE)) 
prednames_fig_halfse <- str_replace(prednames_fig_halfse, "I\\(", "")
prednames_fig_halfse <- str_replace(prednames_fig_halfse, "\\^2\\)", "")
prednames_fig_halfse <- unique(prednames_fig_halfse[prednames_fig_halfse>0])
prednames_fig_halfse_num <- length(prednames_fig_halfse)
}
# make a table
kable(coefs, col.names = c("Coefficient Name", "Value from best lambda model", 
                           "Value form 1/2 se lambda", "Value from 1se lambda model")
      ) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 
Coefficient Name Value from best lambda model Value form 1/2 se lambda Value from 1se lambda model
(Intercept) 3.8788299 4.0117685 3.9816864
tmean -0.2563057 -0.2934945 -0.3071961
prcp -0.1622070 NA NA
prcp_seasonality -0.0642158 NA NA
prcpTempCorr -0.4342954 -0.4207701 -0.4502603
isothermality -0.3255998 -0.3156178 -0.2905802
sand 0.0300255 NA NA
coarse 0.0393705 0.0449991 0.0376145
tmax_anom 0.0672472 0.0601392 0.0588148
t_warm_anom 0.0010826 0.0093030 0.0096425
prcp_wet_anom -0.0247251 -0.0170910 -0.0152670
precp_dry_anom -0.0126853 NA NA
prcpTempCorr_anom 0.0329903 0.0325631 0.0219003
isothermality_anom -0.0707644 -0.0583357 -0.0790918
annWetDegDays_anom -0.0485856 -0.0480559 -0.0442154
VPD_mean_anom -0.0678166 -0.0618866 NA
VPD_min_anom 0.0990644 0.0969773 0.0677681
frostFreeDays_5_anom 0.0244200 0.0161478 0.0117170
I(prcpTempCorr^2) -0.1023167 -0.1414389 -0.1481062
I(isothermality^2) -0.0944048 -0.1072083 -0.1034817
I(t_warm_anom^2) -0.0213605 -0.0181890 -0.0140426
I(prcp_seasonality_anom^2) -0.0050316 -0.0092860 -0.0066448
I(aboveFreezingMonth_anom^2) -0.0142466 -0.0088829 NA
I(isothermality_anom^2) -0.0193174 -0.0131486 -0.0271670
I(annWetDegDays_anom^2) 0.0124079 NA NA
I(sand^2) -0.0186032 -0.0268311 -0.0249093
I(coarse^2) -0.0451831 -0.0346924 NA
annWatDef:aboveFreezingMonth_anom 0.0422586 NA NA
annWetDegDays_anom:aboveFreezingMonth_anom -0.0162144 NA NA
aboveFreezingMonth_anom:prcp_seasonality_anom -0.0198029 NA NA
prcpTempCorr:aboveFreezingMonth_anom -0.0230777 -0.0310339 NA
VPD_min_anom:aboveFreezingMonth_anom 0.0395599 0.0388454 0.0400276
VPD_min_anom:annWatDef_anom 0.0488996 NA NA
isothermality_anom:annWetDegDays_anom 0.0483294 0.0249744 0.0276404
annWetDegDays_anom:prcp_seasonality_anom 0.0259795 NA NA
prcp_wet_anom:annWetDegDays_anom -0.0252918 NA NA
precp_dry_anom:annWetDegDays_anom -0.0116342 NA NA
tmax_anom:annWetDegDays_anom -0.0083228 0.0072592 0.0091096
annWetDegDays_anom:tmin_anom 0.0160791 NA NA
isothermality:frostFreeDays_5_anom 0.0270591 0.0250806 NA
isothermality_anom:frostFreeDays_5_anom 0.0061565 0.0029608 NA
tmax_anom:frostFreeDays_5_anom 0.0461756 0.0478671 NA
isothermality:isothermality_anom -0.0623391 -0.0689897 -0.0495204
prcp_seasonality:isothermality -0.0324665 NA NA
isothermality:prcp_seasonality_anom -0.0401169 -0.0319767 NA
isothermality:prcp_wet_anom 0.0097445 NA NA
prcpTempCorr:isothermality -0.2699229 -0.2684511 -0.2846895
isothermality:precp_dry_anom 0.0082132 0.0077661 0.0211527
isothermality:VPD_mean_anom -0.1413961 -0.1463436 -0.0687071
isothermality:VPD_min_anom 0.1332360 0.1309971 NA
prcp_seasonality:isothermality_anom -0.0398450 NA NA
prcpTempCorr:isothermality_anom -0.1092717 -0.1142790 -0.1225192
prcpTempCorr_anom:isothermality_anom -0.0019338 NA NA
isothermality_anom:VPD_mean_anom -0.0318589 NA NA
isothermality_anom:VPD_min_anom 0.0449291 NA NA
prcp_seasonality:prcp_seasonality_anom -0.0130152 -0.0090125 -0.0202969
prcp_seasonality:prcpTempCorr -0.1976286 NA NA
prcp_seasonality:tmax_anom -0.0080422 NA NA
prcp_seasonality:tmin_anom -0.0484966 NA NA
prcpTempCorr_anom:prcp_seasonality_anom -0.0195718 -0.0197971 NA
VPD_mean_anom:prcp_seasonality_anom -0.0487747 -0.0315410 -0.0372806
prcpTempCorr:prcp_wet_anom -0.0359219 -0.0446937 -0.0328289
prcp_wet_anom:prcpTempCorr_anom 0.0016391 -0.0004596 -0.0105452
t_warm_anom:prcp_wet_anom -0.0154197 NA NA
prcpTempCorr:prcpTempCorr_anom 0.0353542 NA NA
prcpTempCorr:tmax_anom -0.0173999 NA NA
prcpTempCorr:VPD_mean_anom -0.1295057 -0.1346867 -0.1046516
prcpTempCorr:VPD_min_anom 0.1622269 0.1717036 0.1365087
t_warm_anom:prcpTempCorr_anom 0.0402362 0.0255693 0.0220312
tmax_anom:prcpTempCorr_anom 0.0089892 0.0132213 0.0085666
prcpTempCorr_anom:VPD_mean_anom -0.0258744 -0.0312854 NA
tmax_anom:precp_dry_anom 0.0146702 0.0122264 NA
precp_dry_anom:VPD_mean_anom -0.0147361 NA NA
t_warm_anom:tmin_anom -0.0314015 -0.0290235 NA
tmax_anom:VPD_mean_anom -0.0113656 -0.0183383 -0.0069665
tmean:VPD_mean_anom -0.0402428 -0.0676378 -0.0664706
VPD_mean_anom:VPD_min_anom -0.0220422 -0.0177440 NA
sand:AWHC 0.0688334 0.0589283 0.0583880
sand:coarse 0.1116684 NA NA
I(AWHC^2) NA 0.0008773 -0.0025647
annWetDegDays_anom:precp_dry_anom NA -0.0101314 -0.0063892
isothermality:prcp_seasonality NA -0.0455494 -0.0771423
isothermality_anom:tmin_anom NA 0.0134360 NA
prcpTempCorr:prcp_seasonality NA -0.2023501 -0.2127776
tmax_anom:prcp_seasonality NA -0.0406537 -0.0490257
coarse:sand NA 0.0860816 0.0682180
# calculate RMSE of all models 
RMSE_best <- yardstick::rmse(fullModOut$modelPredictions[,c("obs", "pred_opt")], truth = "obs", estimate = "pred_opt")$.estimate
RMSE_halfse <- yardstick::rmse(fullModOut$modelPredictions[,c("obs", "pred_opt_halfse")], truth = "obs", estimate = "pred_opt_halfse")$.estimate
RMSE_1se <- yardstick::rmse(fullModOut$modelPredictions[,c("obs", "pred_opt_se")], truth = "obs", estimate = "pred_opt_se")$.estimate
# calculate bias of all models
bias_best <-  mean((fullModOut$modelPredictions$obs) - fullModOut$modelPredictions$pred_opt)
bias_halfse <-  mean((fullModOut$modelPredictions$obs) - fullModOut$modelPredictions$pred_opt_halfse)
bias_1se <- mean((fullModOut$modelPredictions$obs) - fullModOut$modelPredictions$pred_opt_se)

uniqueCoeffs <- data.frame("Best lambda model" = c(RMSE_best, bias_best,
  as.integer(length(globModTerms)-1), as.integer(prednames_fig_num), 
                                                   as.integer(sum(prednames_fig %in% c(prednames_clim))),
                                                   as.integer(sum(prednames_fig %in% c(prednames_weath))),
                                                   as.integer(sum(prednames_fig %in% c(prednames_soils)))
                                                   ), 
                           "1/2 se lambda model" = c(RMSE_halfse, bias_halfse,
                             length(globModTerms_halfse)-1, prednames_fig_halfse_num,
                                                   sum(prednames_fig_halfse %in% c(prednames_clim)),
                                                   sum(prednames_fig_halfse %in% c(prednames_weath)),
                                                   sum(prednames_fig_halfse %in% c(prednames_soils))), 
                           "1se lambda model" = c(RMSE_1se, bias_1se,
                             length(globModTerms_1se)-1, prednames_fig_1se_num,
                                                   sum(prednames_fig_1se %in% c(prednames_clim)),
                                                   sum(prednames_fig_1se %in% c(prednames_weath)),
                                                   sum(prednames_fig_1se %in% c(prednames_soils))))
row.names(uniqueCoeffs) <- c("RMSE", "bias: mean(obs-pred.)", "Total number of coefficients", "Number of unique coefficients",
                             "Number of unique climate coefficients", 
                             "Number of unique weather coefficients",  
                             "Number of unique soils coefficients"
                             )

kable(uniqueCoeffs, 
      col.names = c("Best lambda model", "1/2 se lambda model", "1se lambda model"), row.names = TRUE) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 
Best lambda model 1/2 se lambda model 1se lambda model
RMSE 23.4823519 23.7707372 24.1219723
bias: mean(obs-pred.) -0.4286075 -0.4346914 -0.5221771
Total number of coefficients 78.0000000 58.0000000 43.0000000
Number of unique coefficients 23.0000000 20.0000000 19.0000000
Number of unique climate coefficients 6.0000000 4.0000000 4.0000000
Number of unique weather coefficients 14.0000000 13.0000000 12.0000000
Number of unique soils coefficients 3.0000000 3.0000000 3.0000000

Visualizations of Model Predictions and Residuals – using best lambda model

observed vs. predicted values

If the 1se lambda has no terms (is an intercept only model), use the 1/2 se lambda in subsequent figures

if (length(prednames_fig_1se) == 0) {
  mod_secondBest <- fit_glm_halfse
  name_secondBestMod <- "1/2 SE Model"
  prednames_secondBestMod <- prednames_fig_halfse
} else {
  mod_secondBest <- fit_glm_se
  name_secondBestMod <- "1 SE Model"
  prednames_secondBestMod <- prednames_fig_1se
}

Predicting on the data

  # create prediction for each each model
# (i.e. for each fire proporation variable)
predict_by_response <- function(mod, df) {
  df_out <- df
  response_name <- paste0(response, "_pred")
  preds <- predict(mod, newx= df_out, #s="lambda.min", 
                                     type = "response")-2
  preds[preds<0] <- 0
  preds[preds>100] <- 100
  df_out <- df_out %>% cbind(preds)
   colnames(df_out)[ncol(df_out)] <- response_name
  return(df_out)
}

pred_glm1 <- predict_by_response(fit_glm_bestLambda, X[,2:ncol(X)])

## back-transform the 
# add back in true y values
pred_glm1 <- pred_glm1 %>% 
  cbind(modDat_1_s[,c(response, response_t)])

# add back in lat/long data 
pred_glm1 <- pred_glm1 %>% 
  cbind(modDat_1_s[,c("Long", "Lat", "Year")])

pred_glm1$resid <- pred_glm1[,response] - pred_glm1[,paste0(response, "_pred")]
pred_glm1$extremeResid <- NA
pred_glm1[pred_glm1$resid > 70 | pred_glm1$resid < -70,"extremeResid"] <- 1

# plot(x = pred_glm1[,response],
#      y = pred_glm1[,paste0(response, "_pred")],
#      xlab = "observed values", ylab = "predicted values")
# points(x = pred_glm1[!is.na(pred_glm1$extremeResid), response],
#        y = pred_glm1[!is.na(pred_glm1$extremeResid), paste0(response, "_pred")],
#        col = "red")
pred_glm1_1se <- predict_by_response(mod_secondBest, X[,2:ncol(X)])

# add back in true y values
pred_glm1_1se <- pred_glm1_1se %>% 
  cbind(modDat_1_s[,c(response, response_t)])

# add back in lat/long data 
pred_glm1_1se <- pred_glm1_1se %>% 
  cbind(modDat_1_s[,c("Long", "Lat", "Year")])

pred_glm1_1se$resid <- pred_glm1_1se[,response] - pred_glm1_1se[,paste0(response, "_pred")]
pred_glm1_1se$extremeResid <- NA
pred_glm1_1se[pred_glm1_1se$resid > 70 | pred_glm1_1se$resid < -70,"extremeResid"] <- 1

Maps of Observations, Predictions, and Residuals=

Observations across the temporal range of the dataset

pred_glm1 <- pred_glm1 %>% 
  mutate(resid = .[[response]] - .[[paste0(response,"_pred")]]) 

# rasterize
# get reference raster
test_rast <-  rast("../../../Data_raw/dayMet/rawMonthlyData/orders/70e0da02b9d2d6e8faa8c97d211f3546/Daymet_Monthly_V4R1/data/daymet_v4_prcp_monttl_na_1980.tif") %>% 
  terra::aggregate(fact = 8, fun = "mean")
## |---------|---------|---------|---------|=========================================                                          
## add ecoregion boundaries (for our ecoregion level model)
regions <- sf::st_read(dsn = "../../../Data_raw/Level2Ecoregions/", layer = "NA_CEC_Eco_Level2") 
## Reading layer `NA_CEC_Eco_Level2' from data source 
##   `/Users/astears/Documents/Dropbox_static/Work/NAU_USGS_postdoc/PED_vegClimModels/Data_raw/Level2Ecoregions' using driver `ESRI Shapefile'
## Simple feature collection with 2261 features and 8 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -4334052 ymin: -3313739 xmax: 3324076 ymax: 4267265
## Projected CRS: Sphere_ARC_INFO_Lambert_Azimuthal_Equal_Area
regions <- regions %>% 
  st_transform(crs = st_crs(test_rast)) %>% 
  st_make_valid() #%>% 
  #st_crop(st_bbox(test_rast))
# 
# goodRegions_temp <- st_overlaps(y = cropped_states, x = regions, sparse = FALSE) %>% 
#   rowSums() 
# goodRegions <- regions[goodRegions_temp != 0,]

ecoregionLU <- data.frame("NA_L1NAME" = sort(unique(regions$NA_L1NAME)), 
                        "newRegion" = c(NA, "Forest", "dryShrubGrass", 
                                        "dryShrubGrass", "Forest", "dryShrubGrass",
                                       "dryShrubGrass", "Forest", "Forest", 
                                       "dryShrubGrass", "Forest", "Forest", 
                                       "Forest", "Forest", "dryShrubGrass", 
                                       NA
                                        ))
goodRegions <- regions %>% 
  left_join(ecoregionLU)
mapRegions <- goodRegions %>% 
  filter(!is.na(newRegion)) %>% 
  group_by(newRegion) %>% 
  summarise(geometry = sf::st_union(geometry)) %>% 
  ungroup() %>% 
  st_simplify(dTolerance = 1000)
#mapview(mapRegions)
# rasterize data
plotObs <- pred_glm1 %>% 
         drop_na(paste(response)) %>% 
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) %>% 
  terra::rasterize(y = test_rast, 
                   field = response, 
                   fun = mean) #%>% 
  #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>% 
  #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))

# get the extent of this particular raster, and crop it accordingly
tempExt <- crds(plotObs, na.rm = TRUE)

plotObs_2 <- plotObs %>% 
  crop(ext(min(tempExt[,1]), max(tempExt[,1]),
           min(tempExt[,2]), max(tempExt[,2])) 
       )
# make figures
map_obs <- ggplot() +
geom_spatraster(data = plotObs_2) + 
  geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA ) +
  geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
labs(title = paste0("Observations of ", response, " in the ",ecoregion, " ecoregion")) +
  scale_fill_gradient2(low = "brown",
                       mid = "wheat" ,
                       high = "darkgreen" , 
                       midpoint = 0,   na.value = "grey20") + 
  xlim(st_bbox(plotObs_2)[c(1,3)]) + 
  ylim(st_bbox(plotObs_2)[c(2,4)])

hist_obs <- ggplot(pred_glm1) + 
  geom_histogram(aes(.data[[response]]), fill = "lightgrey", col = "darkgrey")

library(ggpubr)
ggarrange(map_obs, hist_obs, heights = c(3,1), ncol = 1)

Predictions across the temporal range of the dataset

# rasterize data
plotPred <- pred_glm1 %>% 
         drop_na(paste0(response,"_pred")) %>% 
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) %>% 
  terra::rasterize(y = test_rast, 
                   field = paste0(response,"_pred"), 
                   fun = mean) #%>% 
  #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>% 
  #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))

# get the point location of those predictions that are > 100
highPred_points <- pred_glm1 %>% 
  filter(.[[paste0(response,"_pred")]] > 100 | 
           .[[paste0(response, "_pred")]] < 0) %>% 
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) 

# get the extent of this particular raster, and crop it accordingly
tempExt <- crds(plotPred, na.rm = TRUE)

plotPred_2 <- plotPred %>% 
  crop(ext(min(tempExt[,1]), max(tempExt[,1]),
           min(tempExt[,2]), max(tempExt[,2])) 
       )
# make figures
map_preds1 <- ggplot() +
geom_spatraster(data = plotPred_2) + 
  geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA )  + 
  geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
  geom_sf(data = highPred_points, col = "red") +
labs(title = paste0("Predictions from the 'best lambda' fitted model of ", response, " in the ",ecoregion, " ecoregion"),
     subtitle =  "bestLambda model")  +
  scale_fill_gradient2(low = "wheat",
                       mid = "darkgreen",
                       high = "red" , 
                       midpoint = 100,   na.value = "grey20",
                       limits = c(0,100)) + 
  xlim(st_bbox(plotObs_2)[c(1,3)]) + 
  ylim(st_bbox(plotObs_2)[c(2,4)])

hist_preds1 <- ggplot(pred_glm1) + 
  geom_histogram(aes(.data[[paste0(response, "_pred")]]), fill = "lightgrey", col = "darkgrey")+ 
  xlim(c(0,100))

ggarrange(map_preds1, hist_preds1, heights = c(3,1), ncol = 1)

# rasterize data
plotPred <- pred_glm1_1se %>% 
         drop_na(paste0(response,"_pred")) %>% 
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) %>% 
  terra::rasterize(y = test_rast, 
                   field = paste0(response,"_pred"), 
                   fun = mean) #%>% 
  #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>% 
  #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))

# get the point location of those predictions that are > 100
highPred_points <- pred_glm1_1se %>% 
  filter(.[[paste0(response,"_pred")]] > 100 | 
           .[[paste0(response, "_pred")]] < 0) %>% 
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) 

# get the extent of this particular raster, and crop it accordingly
tempExt <- crds(plotPred, na.rm = TRUE)

plotPred_2 <- plotPred %>% 
  crop(ext(min(tempExt[,1]), max(tempExt[,1]),
           min(tempExt[,2]), max(tempExt[,2])) 
       )
# make figures
map_preds2 <- ggplot() +
geom_spatraster(data = plotPred_2) + 
  geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA )  + 
  geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
  geom_sf(data = highPred_points, col = "red") +
labs(title = paste0("Predictions from the '1SE lambda' fitted model of ", response, " in the ",ecoregion, " ecoregion"),
     subtitle =  name_secondBestMod)  +
  scale_fill_gradient2(low = "wheat",
                       mid = "darkgreen",
                       high = "red" , 
                       midpoint = 100,   na.value = "grey20",
                       limits = c(0,100)) + 
  xlim(st_bbox(plotObs_2)[c(1,3)]) + 
  ylim(st_bbox(plotObs_2)[c(2,4)])

hist_preds2 <- ggplot(pred_glm1_1se) + 
  geom_histogram(aes(.data[[paste0(response, "_pred")]]), fill = "lightgrey", col = "darkgrey")+ 
  xlim(c(0,100))

ggarrange(map_preds2, hist_preds2, heights = c(3,1), ncol = 1)

Residuals across the entire temporal extent of the dataset

# rasterize data
plotResid_rast <- pred_glm1 %>% 
         drop_na(resid) %>% 
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) %>% 
  terra::rasterize(y = test_rast, 
                   field = "resid", 
                   fun = mean) #%>% 
  #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>% 
  #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))

# get the extent of this particular raster, and crop it accordingly
tempExt <- crds(plotResid_rast, na.rm = TRUE)

plotResid_rast_2 <- plotResid_rast %>% 
  crop(ext(min(tempExt[,1]), max(tempExt[,1]),
           min(tempExt[,2]), max(tempExt[,2])) 
       )

# identify locations where residuals are >100 or < -100
badResids_high <- pred_glm1 %>% 
  filter(resid > 100)  %>% 
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) 
badResids_low <- pred_glm1 %>% 
  filter(resid < -100)  %>% 
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) 
# make figures
map <- ggplot() +
geom_spatraster(data =plotResid_rast_2) + 
  geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA )  + 
  geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
  geom_sf(data = badResids_high, col = "blue") +
  geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Resids. (obs. - pred.) from the ", ecoregion," ecoregion-wide model of ", response),
     subtitle = "bestLambda model \n red points indicate locations that have residuals below -100 \n blue points indicate locatiosn that have residuals above 100") +
  scale_fill_gradient2(low = "red",
                       mid = "white" ,
                       high = "blue" , 
                       midpoint = 0,   na.value = "grey20",
                       limits = c(-100,100)
                       ) + 
  xlim(st_bbox(plotObs_2)[c(1,3)]) + 
  ylim(st_bbox(plotObs_2)[c(2,4)])
hist <- ggplot(pred_glm1) + 
  geom_histogram(aes(resid), fill = "lightgrey", col = "darkgrey") + 
  geom_text(aes(x = min(resid)*.9, y = 1500, label = paste0("min = ", round(min(resid),2)))) +
  geom_text(aes(x = max(resid)*.9, y = 1500, label = paste0("max = ", round(max(resid),2)))) + 
  geom_vline(aes(xintercept = mean(resid)))

ggarrange(map, hist, heights = c(3,1), ncol = 1)

# rasterize data
plotResid_rast <- pred_glm1_1se %>% 
         drop_na(resid) %>% 
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) %>% 
  terra::rasterize(y = test_rast, 
                   field = "resid", 
                   fun = mean) #%>% 
  #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>% 
  #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))

# get the extent of this particular raster, and crop it accordingly
tempExt <- crds(plotResid_rast, na.rm = TRUE)

plotResid_rast_2 <- plotResid_rast %>% 
  crop(ext(min(tempExt[,1]), max(tempExt[,1]),
           min(tempExt[,2]), max(tempExt[,2])) 
       )

# identify locations where residuals are >100 or < -100
badResids_high <- pred_glm1_1se %>% 
  filter(resid > 100)  %>% 
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) 
badResids_low <- pred_glm1_1se %>% 
  filter(resid < -100)  %>% 
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) 
# make figures
map <- ggplot() +
geom_spatraster(data =plotResid_rast_2) + 
  geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA )  + 
  geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
  geom_sf(data = badResids_high, col = "blue") +
  geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Resids. (obs. - pred.) from the ", ecoregion," ecoregion-wide model of ", response),
     subtitle = paste0(name_secondBestMod,"\n red points indicate locations that have residuals below -100 \n blue points indicate locatiosn that have residuals above 100")) +
  scale_fill_gradient2(low = "red",
                       mid = "white" ,
                       high = "blue" , 
                       midpoint = 0,   na.value = "grey20",
                       limits = c(-100,100)
                       ) + 
  xlim(st_bbox(plotObs_2)[c(1,3)]) + 
  ylim(st_bbox(plotObs_2)[c(2,4)])
hist <- ggplot(pred_glm1_1se) + 
  geom_histogram(aes(resid), fill = "lightgrey", col = "darkgrey") + 
  geom_text(aes(x = min(resid)*.9, y = 1500, label = paste0("min = ", round(min(resid),2)))) +
  geom_text(aes(x = max(resid)*.9, y = 1500, label = paste0("max = ", round(max(resid),2))))+ 
  geom_vline(aes(xintercept = mean(resid)))

ggarrange(map, hist, heights = c(3,1), ncol = 1)

Are there biases of the model predictions across year/lat/long?

# plot residuals against Year
yearResidMod_bestLambda <- ggplot(pred_glm1) + 
  geom_point(aes(x = jitter(Year), y = resid), alpha = .1) + 
  geom_smooth(aes(x = Year, y = resid)) + 
  xlab("Year") + 
  ylab("Residuals") +
  ggtitle("from best lamba model")
yearResidMod_1seLambda <- ggplot(pred_glm1_1se) + 
  geom_point(aes(x = jitter(Year), y = resid), alpha = .1) + 
  geom_smooth(aes(x = Year, y = resid)) + 
  xlab("Year") + 
  ylab("Residuals") +
  ggtitle(paste0("from ", name_secondBestMod))

# plot residuals against Lat
latResidMod_bestLambda <- ggplot(pred_glm1) + 
  geom_point(aes(x = Lat, y = resid), alpha = .1) + 
  geom_smooth(aes(x = Lat, y = resid)) + 
  xlab("Latitude") + 
  ylab("Residuals") +
  ggtitle("from best lamba model")
latResidMod_1seLambda <- ggplot(pred_glm1_1se) + 
  geom_point(aes(x = Lat, y = resid), alpha = .1) + 
  geom_smooth(aes(x = Lat, y = resid)) + 
  xlab("Latitude") + 
  ylab("Residuals") +
  ggtitle(paste0("from ", name_secondBestMod))

# plot residuals against Long
longResidMod_bestLambda <- ggplot(pred_glm1) + 
  geom_point(aes(x = Long, y = resid), alpha = .1) + 
  geom_smooth(aes(x = Long, y = resid)) + 
  xlab("Longitude") + 
  ylab("Residuals") +
  ggtitle("from best lamba model")
longResidMod_1seLambda <- ggplot(pred_glm1_1se) + 
  geom_point(aes(x = Long, y = resid), alpha = .1) + 
  geom_smooth(aes(x = Long, y = resid)) + 
  xlab("Longitude") + 
  ylab("Residuals") +
  ggtitle(paste0("from ", name_secondBestMod))

library(patchwork)
(yearResidMod_bestLambda + yearResidMod_1seLambda) / 
(  latResidMod_bestLambda + latResidMod_1seLambda) /
(  longResidMod_bestLambda + longResidMod_1seLambda)

Quantile plots

Binning predictor variables into “Deciles” (actually percentiles) and looking at the mean predicted probability for each percentile. The use of the word Decentiles is just a legacy thing (they started out being actual Percentiles)

# get deciles for best lambda model 
if (length(prednames_fig) == 0) {
  print("The best lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  pred_glm1_deciles <- predvars2deciles(pred_glm1,
                                      response_vars = response_vars,
                                        pred_vars = prednames_fig, 
                                       cut_points = seq(0, 1, 0.005))
}
# get deciles for 1 SE lambda model 
if (length(prednames_secondBestMod) == 0) {
  print("The 1SE (or 1/2 SE) lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  pred_glm1_deciles_1se <- predvars2deciles(pred_glm1_1se,
                                      response_vars = response_vars,
                                        pred_vars = prednames_secondBestMod, 
                                       cut_points = seq(0, 1, 0.005))
}

Here is a quick version of LOESS curves fit to raw data (to double-check the quantile plot calculations)

if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  pred_glm1 %>%
  dplyr::select(all_of(c(prednames_fig, response_vars))) %>%
  pivot_longer(cols = prednames_fig)  %>%
  ggplot() +
  facet_wrap(~name, scales = "free") +
  geom_point(aes(x = value, y =  .data[[paste(response)]]), col = "darkblue", alpha = .1)  + # observed values
  geom_point(aes(x = value, y = .data[[response_vars[2]]]), col = "lightblue", alpha = .1) + # model-predicted values
  geom_smooth(aes(x = value, y =  .data[[paste(response)]]), col = "black", se = FALSE) +
  geom_smooth(aes(x = value, y = .data[[response_vars[2]]]), col = "brown", se = FALSE) + 
  ggtitle(label = "dark blue points = observations; 
          light blue points = predictions;
          black line = observations;
          brown line = predictions")

}

Below are the actual quantile plots (note that the predictor variables are scaled)

if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {

# publication quality version
g3 <- decile_dotplot_pq(df = pred_glm1_deciles, response = response, IQR = TRUE,
                        CI = FALSE
                        ) + ggtitle("Decile Plot")

g4 <- add_dotplot_inset(g3, df = pred_glm1_deciles, response = response, dfRaw = pred_glm1, add_smooth = TRUE, deciles = FALSE)

  
if(save_figs) {
  png(paste0("figures/quantile_plots/quantile_plot_", response,  "_",ecoregion,".png"), 
     units = "in", res = 600, width = 5.5, height = 3.5 )
    print(g4)
  dev.off()
}

g4
}

if (length(prednames_secondBestMod) == 0) {
  print("The 1 se lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")

  } else {

# publication quality version
g3 <- decile_dotplot_pq(pred_glm1_deciles_1se, response = response, IQR = TRUE) + ggtitle("Decile Plot")

g4 <- add_dotplot_inset(g3, df = pred_glm1_deciles_1se, response = response, dfRaw = pred_glm1_1se, add_smooth = TRUE, deciles = FALSE)

  
if(save_figs) {
  png(paste0("figures/quantile_plots/quantile_plot_", response,  "_",ecoregion,".png"), 
     units = "in", res = 600, width = 5.5, height = 3.5 )
    print(g4)
  dev.off()
}

g4
}

Deciles Filtered

20th and 80th percentiles for each climate variable

df <- pred_glm1[, prednames_fig] #%>% 
  #mutate(MAT = MAT - 273.15) # k to c
quantiles <- purrr::map(df, quantile, probs = c(0.2, 0.8), na.rm = TRUE)

Filtered ‘Decile’ plots of data. These plots show each vegetation variable, but only based on data that falls into the upper and lower two deciles of each predictor variable.

if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles_filt <- predvars2deciles( pred_glm1, 
                         response_vars = response_vars,
                         pred_vars = prednames_fig,
                         filter_var = TRUE,
                         filter_vars = prednames_fig,
                         cut_points = seq(0, 1, 0.005)) 

decile_dotplot_filtered_pq(pred_glm1_deciles_filt, xvars = prednames_fig#, response = response
                           )
#decile_dotplot_filtered_pq(pred_glm1_deciles_filt)

}
## Processed 10375 groups out of 196697. 5% done. Time elapsed: 3s. ETA: 53s.Processed 12619 groups out of 196697. 6% done. Time elapsed: 4s. ETA: 69s.Processed 17189 groups out of 196697. 9% done. Time elapsed: 5s. ETA: 60s.Processed 21849 groups out of 196697. 11% done. Time elapsed: 6s. ETA: 54s.Processed 26505 groups out of 196697. 13% done. Time elapsed: 7s. ETA: 49s.Processed 31014 groups out of 196697. 16% done. Time elapsed: 8s. ETA: 46s.Processed 35630 groups out of 196697. 18% done. Time elapsed: 9s. ETA: 44s.Processed 40122 groups out of 196697. 20% done. Time elapsed: 10s. ETA: 41s.Processed 44796 groups out of 196697. 23% done. Time elapsed: 11s. ETA: 39s.Processed 49165 groups out of 196697. 25% done. Time elapsed: 12s. ETA: 38s.Processed 53794 groups out of 196697. 27% done. Time elapsed: 13s. ETA: 36s.Processed 58396 groups out of 196697. 30% done. Time elapsed: 14s. ETA: 34s.Processed 63043 groups out of 196697. 32% done. Time elapsed: 15s. ETA: 33s.Processed 67443 groups out of 196697. 34% done. Time elapsed: 16s. ETA: 32s.Processed 71904 groups out of 196697. 37% done. Time elapsed: 17s. ETA: 30s.Processed 76378 groups out of 196697. 39% done. Time elapsed: 18s. ETA: 29s.Processed 81024 groups out of 196697. 41% done. Time elapsed: 19s. ETA: 28s.Processed 85343 groups out of 196697. 43% done. Time elapsed: 20s. ETA: 27s.Processed 89912 groups out of 196697. 46% done. Time elapsed: 21s. ETA: 25s.Processed 94520 groups out of 196697. 48% done. Time elapsed: 22s. ETA: 24s.Processed 99124 groups out of 196697. 50% done. Time elapsed: 23s. ETA: 23s.Processed 103514 groups out of 196697. 53% done. Time elapsed: 24s. ETA: 22s.Processed 108064 groups out of 196697. 55% done. Time elapsed: 25s. ETA: 21s.Processed 112702 groups out of 196697. 57% done. Time elapsed: 26s. ETA: 19s.Processed 117338 groups out of 196697. 60% done. Time elapsed: 27s. ETA: 18s.Processed 121953 groups out of 196697. 62% done. Time elapsed: 28s. ETA: 17s.Processed 126599 groups out of 196697. 64% done. Time elapsed: 29s. ETA: 16s.Processed 130947 groups out of 196697. 67% done. Time elapsed: 30s. ETA: 15s.Processed 135316 groups out of 196697. 69% done. Time elapsed: 31s. ETA: 14s.Processed 139967 groups out of 196697. 71% done. Time elapsed: 32s. ETA: 13s.Processed 144584 groups out of 196697. 74% done. Time elapsed: 33s. ETA: 12s.Processed 149152 groups out of 196697. 76% done. Time elapsed: 34s. ETA: 11s.Processed 153708 groups out of 196697. 78% done. Time elapsed: 35s. ETA: 10s.Processed 158336 groups out of 196697. 80% done. Time elapsed: 36s. ETA: 8s.Processed 162761 groups out of 196697. 83% done. Time elapsed: 37s. ETA: 7s.Processed 167440 groups out of 196697. 85% done. Time elapsed: 38s. ETA: 6s.Processed 172121 groups out of 196697. 88% done. Time elapsed: 39s. ETA: 5s.Processed 176766 groups out of 196697. 90% done. Time elapsed: 40s. ETA: 4s.Processed 181249 groups out of 196697. 92% done. Time elapsed: 41s. ETA: 3s.Processed 185751 groups out of 196697. 94% done. Time elapsed: 42s. ETA: 2s.Processed 190377 groups out of 196697. 97% done. Time elapsed: 43s. ETA: 1s.Processed 194406 groups out of 196697. 99% done. Time elapsed: 45s. ETA: 0s.Processed 196697 groups out of 196697. 100% done. Time elapsed: 45s. ETA: 0s.

Filtered quantile figure with middle 2 deciles also shown

if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles_filt_mid <- predvars2deciles(pred_glm1, 
                         response_vars = response_vars,
                         pred_vars = prednames_fig,
                         filter_vars = prednames_fig,
                         filter_var = TRUE,
                         add_mid = TRUE,
                         cut_points = seq(0, 1, 0.005))

g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt_mid, #response = response, 
                                xvars = prednames_fig)
g

if(save_figs) {x
jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", , ".jpeg"),
     units = "in", res = 600, width = 5.5, height = 6 )
  g 
dev.off()
}
}
## Processed 11008 groups out of 294990. 4% done. Time elapsed: 3s. ETA: 77s.Processed 14498 groups out of 294990. 5% done. Time elapsed: 4s. ETA: 77s.Processed 17650 groups out of 294990. 6% done. Time elapsed: 5s. ETA: 78s.Processed 21491 groups out of 294990. 7% done. Time elapsed: 6s. ETA: 76s.Processed 25374 groups out of 294990. 9% done. Time elapsed: 7s. ETA: 74s.Processed 29128 groups out of 294990. 10% done. Time elapsed: 8s. ETA: 73s.Processed 32979 groups out of 294990. 11% done. Time elapsed: 9s. ETA: 72s.Processed 36862 groups out of 294990. 12% done. Time elapsed: 10s. ETA: 70s.Processed 40604 groups out of 294990. 14% done. Time elapsed: 11s. ETA: 69s.Processed 44421 groups out of 294990. 15% done. Time elapsed: 12s. ETA: 68s.Processed 48024 groups out of 294990. 16% done. Time elapsed: 13s. ETA: 67s.Processed 51628 groups out of 294990. 18% done. Time elapsed: 14s. ETA: 66s.Processed 55507 groups out of 294990. 19% done. Time elapsed: 15s. ETA: 65s.Processed 57932 groups out of 294990. 20% done. Time elapsed: 16s. ETA: 66s.Processed 60273 groups out of 294990. 20% done. Time elapsed: 17s. ETA: 66s.Processed 62433 groups out of 294990. 21% done. Time elapsed: 18s. ETA: 67s.Processed 65881 groups out of 294990. 22% done. Time elapsed: 19s. ETA: 66s.Processed 69621 groups out of 294990. 24% done. Time elapsed: 20s. ETA: 65s.Processed 72744 groups out of 294990. 25% done. Time elapsed: 21s. ETA: 64s.Processed 76473 groups out of 294990. 26% done. Time elapsed: 22s. ETA: 63s.Processed 80208 groups out of 294990. 27% done. Time elapsed: 23s. ETA: 61s.Processed 83765 groups out of 294990. 28% done. Time elapsed: 24s. ETA: 60s.Processed 87051 groups out of 294990. 30% done. Time elapsed: 25s. ETA: 60s.Processed 90654 groups out of 294990. 31% done. Time elapsed: 26s. ETA: 58s.Processed 94446 groups out of 294990. 32% done. Time elapsed: 27s. ETA: 57s.Processed 97993 groups out of 294990. 33% done. Time elapsed: 28s. ETA: 56s.Processed 101664 groups out of 294990. 34% done. Time elapsed: 29s. ETA: 55s.Processed 105062 groups out of 294990. 36% done. Time elapsed: 30s. ETA: 54s.Processed 108779 groups out of 294990. 37% done. Time elapsed: 31s. ETA: 53s.Processed 112532 groups out of 294990. 38% done. Time elapsed: 32s. ETA: 52s.Processed 114063 groups out of 294990. 39% done. Time elapsed: 33s. ETA: 53s.Processed 118423 groups out of 294990. 40% done. Time elapsed: 34s. ETA: 51s.Processed 122625 groups out of 294990. 42% done. Time elapsed: 35s. ETA: 49s.Processed 127045 groups out of 294990. 43% done. Time elapsed: 36s. ETA: 48s.Processed 131510 groups out of 294990. 45% done. Time elapsed: 37s. ETA: 46s.Processed 135976 groups out of 294990. 46% done. Time elapsed: 38s. ETA: 44s.Processed 140476 groups out of 294990. 48% done. Time elapsed: 39s. ETA: 43s.Processed 144760 groups out of 294990. 49% done. Time elapsed: 40s. ETA: 41s.Processed 149278 groups out of 294990. 51% done. Time elapsed: 41s. ETA: 40s.Processed 153830 groups out of 294990. 52% done. Time elapsed: 42s. ETA: 38s.Processed 158413 groups out of 294990. 54% done. Time elapsed: 43s. ETA: 37s.Processed 162873 groups out of 294990. 55% done. Time elapsed: 44s. ETA: 36s.Processed 167262 groups out of 294990. 57% done. Time elapsed: 45s. ETA: 34s.Processed 171702 groups out of 294990. 58% done. Time elapsed: 46s. ETA: 33s.Processed 176086 groups out of 294990. 60% done. Time elapsed: 47s. ETA: 32s.Processed 180633 groups out of 294990. 61% done. Time elapsed: 48s. ETA: 30s.Processed 185153 groups out of 294990. 63% done. Time elapsed: 49s. ETA: 29s.Processed 189468 groups out of 294990. 64% done. Time elapsed: 50s. ETA: 28s.Processed 194016 groups out of 294990. 66% done. Time elapsed: 51s. ETA: 26s.Processed 198539 groups out of 294990. 67% done. Time elapsed: 52s. ETA: 25s.Processed 203051 groups out of 294990. 69% done. Time elapsed: 53s. ETA: 24s.Processed 207551 groups out of 294990. 70% done. Time elapsed: 54s. ETA: 22s.Processed 212062 groups out of 294990. 72% done. Time elapsed: 55s. ETA: 21s.Processed 216729 groups out of 294990. 73% done. Time elapsed: 56s. ETA: 20s.Processed 221082 groups out of 294990. 75% done. Time elapsed: 57s. ETA: 19s.Processed 225538 groups out of 294990. 76% done. Time elapsed: 58s. ETA: 18s.Processed 229875 groups out of 294990. 78% done. Time elapsed: 59s. ETA: 16s.Processed 234292 groups out of 294990. 79% done. Time elapsed: 60s. ETA: 15s.Processed 238820 groups out of 294990. 81% done. Time elapsed: 61s. ETA: 14s.Processed 243364 groups out of 294990. 82% done. Time elapsed: 62s. ETA: 13s.Processed 247912 groups out of 294990. 84% done. Time elapsed: 63s. ETA: 12s.Processed 252224 groups out of 294990. 86% done. Time elapsed: 64s. ETA: 10s.Processed 256804 groups out of 294990. 87% done. Time elapsed: 65s. ETA: 9s.Processed 261329 groups out of 294990. 89% done. Time elapsed: 66s. ETA: 8s.Processed 265966 groups out of 294990. 90% done. Time elapsed: 67s. ETA: 7s.Processed 270340 groups out of 294990. 92% done. Time elapsed: 68s. ETA: 6s.Processed 274865 groups out of 294990. 93% done. Time elapsed: 69s. ETA: 5s.Processed 279441 groups out of 294990. 95% done. Time elapsed: 70s. ETA: 3s.Processed 284000 groups out of 294990. 96% done. Time elapsed: 71s. ETA: 2s.Processed 288479 groups out of 294990. 98% done. Time elapsed: 72s. ETA: 1s.Processed 290781 groups out of 294990. 99% done. Time elapsed: 73s. ETA: 1s.Processed 294990 groups out of 294990. 100% done. Time elapsed: 74s. ETA: 0s.

Show model RMSE w/in each quantile

# get deciles for best lambda model 
if (length(prednames_fig) == 0) {
  print("The best lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  pred_glm1_deciles %>% 
    ggplot(aes(x = mean_value, y = RMSE)) +
    facet_wrap(~name, scales = "free_x")+
    geom_point(alpha = .2, size = .5) + 
    geom_smooth(lwd = .5) + 
    xlab("Scaled predictor value") + 
    ggtitle("RMSE by decile for bestLambda model")
}

# get deciles for 1 SE lambda model 
if (length(prednames_secondBestMod) == 0) {
  print("The 1SE (or 1/2 SE) lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  pred_glm1_deciles_1se %>% 
    ggplot(aes(x = mean_value, y = RMSE)) +
    facet_wrap(~name, scales = "free_x")+
    geom_point(alpha = .2, size = .5) + 
    geom_smooth(lwd = .5) + 
    xlab("Scaled predictor value") + 
    ggtitle(paste0("RMSE by decile for ", name_secondBestMod, "model"))
}

Cross-validation

Using best lambda model

Use terms from global model to re-fit and predict on different held out regions

Figures show residuals for each of the models fit to held-out ecoregions

These models were fit to six ecoregions, and then predict on the indicated heldout ecoregion

if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so cross validation isn't practical")
} else {
  
## code from Tredennick et al. 2020
# try each separate level II ecoregion as a test set
# make a list to hold output data
outList <- vector(mode = "list", length = length(sort(unique(modDat_1_s$NA_L2NAME))))
# obs_pred <- data.frame(ecoregion = character(),obs = numeric(),
#                        pred_opt = numeric(), pred_null = numeric()#,
#                        #pred_nopenalty = numeric()
#                        )

## get the model specification from the global model
mat <- as.matrix(coef(fit_glm_bestLambda, s = "lambda.min"))
mat2 <- mat[mat[,1] != 0,]

f_cv <- as.formula(paste0(response_t, " ~ ", paste0(names(mat2)[2:length(names(mat2))], collapse = " + ")))

X_cv <- model.matrix(object = f_cv, data = modDat_1_s)
# get response variable
y_cv <- as.matrix(modDat_1_s[,response_t])

  
# now, loop through so with each iteration, a different ecoregion is held out
 for(i_eco in sort(unique(modDat_1_s$NA_L2NAME))){

  # split into training and test sets
  test_eco <- i_eco
  print(test_eco)
  # identify the rowID of observations to be in the training and test datasets
  train <- which(modDat_1_s$NA_L2NAME!=test_eco) # data for all ecoregions that aren't 'i_eco'
  test <- which(modDat_1_s$NA_L2NAME==test_eco) # data for the ecoregion that is 'i_eco'

  trainDat_all <- modDat_1_s %>% 
    slice(train) %>% 
    dplyr::select(-newRegion)
  testDat_all <- modDat_1_s %>% 
    slice(test) %>% 
    dplyr::select(-newRegion)

  # get the model matrices for input and response variables for cross validation model specification
  X_train <- as.matrix(X_cv[train,])
  X_test <- as.matrix(X_cv[test,])

  y_train <- modDat_1_s[train,response_t]
  y_test <- modDat_1_s[test,response_t]
  
  # get the model matrices for input and response variables for original model specification
  X_train_glob <- as.matrix(X[train,])
  X_test_glob <- as.matrix(X[test,])

  y_train_glob <- modDat_1_s[train,response_t]
  y_test_glob <- modDat_1_s[test,response_t]

  train_eco <- modDat_1_s$NA_L2NAME[train]

  ## just try a regular glm w/ the components from the global model
  fit_i <- glm(data = trainDat_all, formula = f_cv, 
    ,
               family =  stats::Gamma(link = "log")
    )
    
  # lasso model predictions with the optimal lambda (back transformed)
  optimal_pred <- predict(fit_i, newdata= testDat_all, type = "response") - 2
  # null model and predictions
  # the "null" model in this case is the global model 
  # predict on the test data for this iteration w/ the global model (back transformed)
  null_pred <- predict.glm(fit_glm_bestLambda, newdata = testDat_all, type = "response") - 2

  
  # save data
  tmp <- data.frame(ecoRegion_holdout = rep(test_eco,length(y_test)),
                    obs=y_test-2,
                    pred_opt=optimal_pred, 
                    pred_null=null_pred#,
                    #pred_nopenalty=nopen_pred
                    ) %>%
    cbind(testDat_all)
  
  # calculate RMSE, bias, etc. of 
  # RMSE of CV model 
  RMSE_optimal <- yardstick::rmse(data = data.frame(optimal_pred,"y_test" = (y_test-2)), truth = "y_test", estimate = "optimal_pred")[1,]$.estimate
  # RMSE of global model
  RMSE_null <- yardstick::rmse(data = data.frame(null_pred,"y_test" = (y_test-2)), truth = "y_test", estimate = "null_pred")[1,]$.estimate
  # bias of CV model
  bias_optimal <- mean((y_test-2) - optimal_pred)
  # bias of global model
  bias_null <-  mean((y_test-2) - null_pred )
  
  # put output into a list
  tmpList <- list("testRegion" = i_eco,
    "modelObject" = fit_i,
       "modelPredictions" = tmp, 
    "performanceMetrics" = data.frame("RMSE_cvModel" = RMSE_optimal, 
                                      "RMSE_globalModel" = RMSE_null, 
                                      "bias_cvModel" = bias_optimal, 
                                      "bias_globalModel" = bias_null))

  # save model outputs
  outList[[which(sort(unique(modDat_1_s$NA_L2NAME)) == i_eco)]] <- tmpList
 }
}
## [1] "COLD DESERTS"
## [1] "MEDITERRANEAN PIEDMONT"
## [1] "SEMIARID AND TEMPERATE PRAIRIES"
## [1] "WARM DESERTS"
## [1] "WEST-CENTRAL SEMIARID PRAIRIES"

Below are the RMSE and bias values for predictions made for each holdout level II ecoregion, compared to predictions from the global model for that same ecoregion

# table of model performance
purrr::map(outList, .f = function(x) {
  cbind(data.frame("holdout region" = x$testRegion),  x$performanceMetrics)
}
) %>% 
  purrr::list_rbind() %>% 
  kable(col.names = c("Held-out ecoregion", "RMSE of CV model", "RMSE of global model", 
                      "bias of CV model - mean(obs-pred.)", "bias of global model- mean(obs-pred.)"), 
        caption = "Performance of Cross Validation using 'best lambda' model specification") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 
Performance of Cross Validation using ‘best lambda’ model specification
Held-out ecoregion RMSE of CV model RMSE of global model bias of CV model - mean(obs-pred.) bias of global model- mean(obs-pred.)
COLD DESERTS 47.52123 23.97935 -11.098442 -0.4778954
MEDITERRANEAN PIEDMONT 39.35362 21.91749 -21.287012 -8.1886287
SEMIARID AND TEMPERATE PRAIRIES 16.53336 15.21839 5.628126 1.5610215
WARM DESERTS 25.72138 24.23846 4.628654 1.0447313
WEST-CENTRAL SEMIARID PRAIRIES 30.86329 22.16057 -7.221834 0.0075142
# visualize model predictions
for (i in 1:length(unique(modDat_1_s$NA_L2NAME))) {
  holdoutRegion <- outList[[i]]$testRegion
  predictionData <- outList[[i]]$modelPredictions
  modTerms <- as.matrix(coef(outList[[i]]$modelObject)) %>%
    as.data.frame() %>%
    filter(V1!=0) %>%
    rownames()

  # calculate residuals
  predictionData <- predictionData %>%
  mutate(resid = .[["obs"]] - .[["pred_opt"]] ,
         resid_globMod = .[["obs"]]  - .[["pred_null"]])


# rasterize
# use 'test_rast' from earlier

  # rasterize data
plotObs <- predictionData %>%
         drop_na(paste(response)) %>%
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Long", "Lat")) %>%
  terra::set.crs(crs(test_rast)) %>%
  terra::rasterize(y = test_rast,
                   field = "resid",
                   fun = mean) #%>%
  #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
  #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))

tempExt <- crds(plotObs, na.rm = TRUE)

plotObs_2 <- plotObs %>% 
  crop(ext(min(tempExt[,1]), max(tempExt[,1]),
           min(tempExt[,2]), max(tempExt[,2])) 
       )

# identify locations where residuals are >100 or < -100
badResids_high <- predictionData %>% 
  filter(resid > 100)  %>% 
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) 
badResids_low <- predictionData %>% 
  filter(resid < -100)  %>% 
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) 


# make figures
# make histogram
hist_i <- ggplot(predictionData) +
  geom_histogram(aes(resid_globMod), col = "darkgrey", fill = "lightgrey") +
  xlab(c("Residuals (obs. - pred.)"))
# make map
map_i <-  ggplot() +
geom_spatraster(data = plotObs_2) +
  geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA ) +
  geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
  geom_sf(data = badResids_high, col = "blue") +
  geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Residuals (obs. - pred.) for predictions of \n", holdoutRegion, " \n from a model fit to other ecoregions"),
     subtitle = paste0(response, " ~ ", paste0( modTerms, collapse = " + "))) +
  scale_fill_gradient2(low = "red",
                       mid = "white" ,
                       high = "blue" ,
                       midpoint = 0,   na.value = "grey20",
                       limits = c(-100, 100))  + 
  xlim(st_bbox(plotObs_2)[c(1,3)]) + 
  ylim(st_bbox(plotObs_2)[c(2,4)])

 assign(paste0("residPlot_",holdoutRegion),
   value = ggarrange(map_i, hist_i, heights = c(3,1), ncol = 1)
)

}

  lapply(unique(modDat_1_s$NA_L2NAME), FUN = function(x) {
    get(paste0("residPlot_", x))
  })
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

Using second best lambda model (either 1se or 1/2se)

Use terms from global model to re-fit and predict on different held out regions

Figures show residuals for each of the models fit to held-out ecoregions

These models were fit to six ecoregions, and then predict on the indicated heldout ecoregion

if (length(prednames_secondBestMod) == 0) {
  print("The model only contains one predictor (an intercept), so cross validation isn't practical")
} else {

## code from Tredennick et al. 2020
# try each separate level II ecoregion as a test set
# make a list to hold output data
outList <- vector(mode = "list", length = length(sort(unique(modDat_1_s$NA_L2NAME))))
# obs_pred <- data.frame(ecoregion = character(),obs = numeric(),
#                        pred_opt = numeric(), pred_null = numeric()#,
#                        #pred_nopenalty = numeric()
#                        )

## get the model specification from the global model
mat <- as.matrix(coef(mod_secondBest, s = "lambda.min"))
mat2 <- mat[mat[,1] != 0,]

f_cv <- as.formula(paste0(response_t, " ~ ", paste0(names(mat2)[2:length(names(mat2))], collapse = " + ")))

X_cv <- model.matrix(object = f_cv, data = modDat_1_s)
# get response variable
y_cv <- as.matrix(modDat_1_s[,response_t])

  
# now, loop through so with each iteration, a different ecoregion is held out
 for(i_eco in sort(unique(modDat_1_s$NA_L2NAME))){

  # split into training and test sets
  test_eco <- i_eco
  print(test_eco)
  # identify the rowID of observations to be in the training and test datasets
  train <- which(modDat_1_s$NA_L2NAME!=test_eco) # data for all ecoregions that aren't 'i_eco'
  test <- which(modDat_1_s$NA_L2NAME==test_eco) # data for the ecoregion that is 'i_eco'

  trainDat_all <- modDat_1_s %>% 
    slice(train) %>% 
    dplyr::select(-newRegion)
  testDat_all <- modDat_1_s %>% 
    slice(test) %>% 
    dplyr::select(-newRegion)

  # get the model matrices for input and response variables for cross validation model specification
  X_train <- as.matrix(X_cv[train,])
  X_test <- as.matrix(X_cv[test,])

  y_train <- modDat_1_s[train,response_t]
  y_test <- modDat_1_s[test,response_t]
  
  # get the model matrices for input and response variables for original model specification
  X_train_glob <- as.matrix(X[train,])
  X_test_glob <- as.matrix(X[test,])

  y_train_glob <- modDat_1_s[train,response_t]
  y_test_glob <- modDat_1_s[test,response_t]

  train_eco <- modDat_1_s$NA_L2NAME[train]

  ## just try a regular glm w/ the components from the global model
  fit_i <- glm(data = trainDat_all, formula = f_cv, 
               family =  stats::Gamma(link = "log")
    )

    coef(fit_i)
    
  # lasso model predictions with the optimal lambda
  optimal_pred <- predict(fit_i, newdata= testDat_all, type = "response") - 2 
  # null model and predictions
  # the "null" model in this case is the global model
  # predict on the test data for this iteration w/ the global model 
  null_pred <- predict.glm(mod_secondBest, newdata = testDat_all, type = "response") - 2

  # save data
  tmp <- data.frame(ecoRegion_holdout = rep(test_eco,length(y_test)),
                    obs=y_test-2,
                    pred_opt=optimal_pred, 
                    pred_null=null_pred#,
                    #pred_nopenalty=nopen_pred
                    ) %>%
    cbind(testDat_all)
    
  # calculate RMSE, bias, etc. of 
  # RMSE of CV model 
  RMSE_optimal <- yardstick::rmse(data = data.frame(optimal_pred, "y_test" = (y_test-2)), truth = "y_test", estimate = "optimal_pred")[1,]$.estimate
  # RMSE of global model
  RMSE_null <- yardstick::rmse(data = data.frame(null_pred,  "y_test" = (y_test-2)), truth = "y_test", estimate = "null_pred")[1,]$.estimate
  # bias of CV model
  bias_optimal <- mean((y_test-2) - optimal_pred)
  # bias of global model
  bias_null <-  mean((y_test-2) - null_pred )
  
  # put output into a list
  tmpList <- list("testRegion" = i_eco,
    "modelObject" = fit_i,
       "modelPredictions" = tmp, 
    "performanceMetrics" = data.frame("RMSE_cvModel" = RMSE_optimal, 
                                      "RMSE_globalModel" = RMSE_null, 
                                      "bias_cvModel" = bias_optimal, 
                                      "bias_globalModel" = bias_null))

  # save model outputs
  outList[[which(sort(unique(modDat_1_s$NA_L2NAME)) == i_eco)]] <- tmpList
 }
}
## [1] "COLD DESERTS"
## [1] "MEDITERRANEAN PIEDMONT"
## [1] "SEMIARID AND TEMPERATE PRAIRIES"
## [1] "WARM DESERTS"
## [1] "WEST-CENTRAL SEMIARID PRAIRIES"

Below are the RMSE and bias values for predictions made for each holdout level II ecoregion, compared to predictions from the global model for that same ecoregion

if (length(prednames_secondBestMod) == 0) {
  print("The model only contains one predictor (an intercept), so cross validation isn't practical")
} else {
# table of model performance
purrr::map(outList, .f = function(x) {
  cbind(data.frame("holdout region" = x$testRegion),  x$performanceMetrics)
}
) %>% 
  purrr::list_rbind() %>% 
  kable(col.names = c("Held-out ecoregion", "RMSE of CV model", "RMSE of global model", 
                      "bias of CV model - mean(obs-pred.)", "bias of global model - mean(obs-pred.)"), 
        caption = "Performance of Cross Validation using '1 SE lambda' model specification") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 
}
Performance of Cross Validation using ‘1 SE lambda’ model specification
Held-out ecoregion RMSE of CV model RMSE of global model bias of CV model - mean(obs-pred.) bias of global model - mean(obs-pred.)
COLD DESERTS 34.74487 24.67384 4.1608344 -0.5162047
MEDITERRANEAN PIEDMONT 42.13254 26.73339 -25.1281672 -12.7305425
SEMIARID AND TEMPERATE PRAIRIES 15.83326 15.54354 0.3935225 -0.1849678
WARM DESERTS 25.43259 24.44564 5.2404650 1.2569647
WEST-CENTRAL SEMIARID PRAIRIES 25.15915 21.93052 -1.0220980 0.9129080
if (length(prednames_secondBestMod) == 0) {
  print("The model only contains one predictor (an intercept), so cross validation isn't practical")
} else {
for (i in 1:length(unique(modDat_1_s$NA_L2NAME))) {
  holdoutRegion <- outList[[i]]$testRegion
  predictionData <- outList[[i]]$modelPredictions
  modTerms <- as.matrix(coef(outList[[i]]$modelObject)) %>%
    as.data.frame() %>%
    filter(V1!=0) %>%
    rownames()

  # calculate residuals
  predictionData <- predictionData %>%
  mutate(resid = .[["obs"]] - .[["pred_opt"]] ,
         resid_globMod = .[["obs"]]  - .[["pred_null"]])


# rasterize
# use 'test_rast' from earlier

  # rasterize data
plotObs <- predictionData %>%
         drop_na(paste(response)) %>%
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Long", "Lat")) %>%
  terra::set.crs(crs(test_rast)) %>%
  terra::rasterize(y = test_rast,
                   field = "resid",
                   fun = mean) #%>%
  #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
  #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))

tempExt <- crds(plotObs, na.rm = TRUE)

plotObs_2 <- plotObs %>% 
  crop(ext(min(tempExt[,1]), max(tempExt[,1]),
           min(tempExt[,2]), max(tempExt[,2])) 
       )

# identify locations where residuals are >100 or < -100
badResids_high <- predictionData %>% 
  filter(resid > 100)  %>% 
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) 
badResids_low <- predictionData %>% 
  filter(resid < -100)  %>% 
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) 


# make figures
# make histogram
hist_i <- ggplot(predictionData) +
  geom_histogram(aes(resid_globMod), col = "darkgrey", fill = "lightgrey") +
  xlab(c("Residuals (obs. - pred.)"))
# make map
map_i <-  ggplot() +
geom_spatraster(data = plotObs_2) +
  geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA ) +
  geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
  geom_sf(data = badResids_high, col = "blue") +
  geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Residuals (obs. - pred.) for predictions of \n", holdoutRegion, " \n from a model fit to other ecoregions"),
     subtitle = paste0(response, " ~ ", paste0( modTerms, collapse = " + "))) +
  scale_fill_gradient2(low = "red",
                       mid = "white" ,
                       high = "blue" ,
                       midpoint = 0,   na.value = "grey20",
                       limits = c(-100, 100))  + 
  xlim(st_bbox(plotObs_2)[c(1,3)]) + 
  ylim(st_bbox(plotObs_2)[c(2,4)])

 assign(paste0("residPlot_",holdoutRegion),
   value = ggarrange(map_i, hist_i, heights = c(3,1), ncol = 1)
)

}

  lapply(unique(modDat_1_s$NA_L2NAME), FUN = function(x) {
    get(paste0("residPlot_", x))
  })
}
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

Save output

## save the coefficients for the models (best lambda, 1/2se lambda, 1se lambda)
if(trimAnom == TRUE) {
saveRDS(coefs, file = paste0("./models/modelCoefficients_trimAnom_", ecoregion, "_", response,".rds")) 
saveRDS(uniqueCoeffs, file = paste0("./models/modelMetrics_trimAnom_", ecoregion, "_", response,".rds")) 
} else {
saveRDS(coefs, file = paste0("./models/modelCoefficients_", ecoregion, "_", response,".rds")) 
saveRDS(uniqueCoeffs, file = paste0("./models/modelMetrics_", ecoregion, "_", response,".rds")) 
}
# make a table
## partial dependence plots
#vip::vip(mod_glmFinal, num_features = 15)

#pdp_all_vars(mod_glmFinal, mod_vars = pred_vars, ylab = 'probability',train = df_small)

#caret::varImp(fit)

session info

Hash of current commit (i.e. to ID the version of the code used)

system("git rev-parse HEAD", intern=TRUE)
## [1] "3626465cd23dd1b08fb4ad6058d5626267d2e6f2"

Packages etc.

sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sonoma 14.7.6
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggpubr_0.6.0               doMC_1.3.8                 iterators_1.0.14           foreach_1.5.2              factoextra_1.0.7          
##  [6] USA.state.boundaries_1.0.1 glmnet_4.1-8               Matrix_1.7-0               kableExtra_1.4.0           rsample_1.2.1             
## [11] here_1.0.1                 StepBeta_2.1.0             ggtext_0.1.2               knitr_1.49                 gridExtra_2.3             
## [16] pdp_0.8.2                  GGally_2.2.1               lubridate_1.9.4            forcats_1.0.0              stringr_1.5.1             
## [21] dplyr_1.1.4                purrr_1.0.4                readr_2.1.5                tidyr_1.3.1                tibble_3.2.1              
## [26] tidyverse_2.0.0            caret_6.0-94               lattice_0.22-6             ggplot2_3.5.1              sf_1.0-20                 
## [31] tidyterra_0.6.1            terra_1.8-21               ggspatial_1.1.9            dtplyr_1.3.1               patchwork_1.3.0           
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3   rstudioapi_0.17.1    jsonlite_1.9.1       shape_1.4.6.1        magrittr_2.0.3       modeltools_0.2-23   
##   [7] farver_2.1.2         rmarkdown_2.29       vctrs_0.6.5          rstatix_0.7.2        htmltools_0.5.8.1    broom_1.0.7         
##  [13] Formula_1.2-5        pROC_1.18.5          sass_0.4.9           parallelly_1.37.1    KernSmooth_2.23-22   bslib_0.9.0         
##  [19] plyr_1.8.9           sandwich_3.1-0       zoo_1.8-12           cachem_1.1.0         commonmark_1.9.1     lifecycle_1.0.4     
##  [25] pkgconfig_2.0.3      R6_2.6.1             fastmap_1.2.0        future_1.33.2        digest_0.6.37        colorspace_2.1-1    
##  [31] furrr_0.3.1          rprojroot_2.0.4      labeling_0.4.3       yardstick_1.3.1      timechange_0.3.0     mgcv_1.9-1          
##  [37] abind_1.4-8          compiler_4.4.0       proxy_0.4-27         aod_1.3.3            withr_3.0.2          backports_1.5.0     
##  [43] carData_3.0-5        betareg_3.1-4        DBI_1.2.3            ggstats_0.9.0        ggsignif_0.6.4       MASS_7.3-60.2       
##  [49] lava_1.8.0           classInt_0.4-10      gtools_3.9.5         ModelMetrics_1.2.2.2 tools_4.4.0          units_0.8-5         
##  [55] lmtest_0.9-40        future.apply_1.11.2  nnet_7.3-19          glue_1.8.0           nlme_3.1-164         gridtext_0.1.5      
##  [61] grid_4.4.0           reshape2_1.4.4       generics_0.1.3       recipes_1.1.0        gtable_0.3.6         tzdb_0.4.0          
##  [67] class_7.3-22         data.table_1.17.0    hms_1.1.3            car_3.1-2            xml2_1.3.7           flexmix_2.3-19      
##  [73] markdown_1.13        ggrepel_0.9.5        pillar_1.10.1        splines_4.4.0        survival_3.5-8       tidyselect_1.2.1    
##  [79] svglite_2.1.3        stats4_4.4.0         xfun_0.51            hardhat_1.4.0        timeDate_4032.109    stringi_1.8.4       
##  [85] yaml_2.3.10          evaluate_1.0.3       codetools_0.2-20     cli_3.6.4            rpart_4.1.23         systemfonts_1.2.1   
##  [91] munsell_0.5.1        jquerylib_0.1.4      Rcpp_1.0.14          globals_0.16.3       gower_1.0.1          listenv_0.9.1       
##  [97] viridisLite_0.4.2    ipred_0.9-15         scales_1.3.0         prodlim_2024.06.25   e1071_1.7-14         combinat_0.0-8      
## [103] rlang_1.1.5          cowplot_1.1.3